mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-5149 update unicode tools after cvs problems.
X-SVN-Rev: 19520
This commit is contained in:
parent
ea4cd7f0fa
commit
557bade86a
33 changed files with 964 additions and 346 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2005/06/08 01:44:48 $
|
||||
* $Revision: 1.42 $
|
||||
* $Date: 2006/04/05 22:12:46 $
|
||||
* $Revision: 1.43 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -18,6 +18,7 @@ import com.ibm.icu.text.UTF16;
|
|||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
|
@ -33,7 +34,6 @@ import java.text.DateFormat;
|
|||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.UCD.UCD_Types;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UCD.Normalizer;
|
||||
|
||||
|
@ -4104,8 +4104,8 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
bf.setLineSeparator("<br>\r\n");
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
bf.setShowLiteral(bf.toHTML);
|
||||
bf.setFixName(bf.toHTML);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTML);
|
||||
bf.setFixName(TransliteratorUtilities.toHTML);
|
||||
UCD ucd = Default.ucd();
|
||||
UnicodeProperty cat = ups.getProperty("gc");
|
||||
UnicodeSet ucd410 = cat.getSet("Cn")
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
# The data supports both implementations that require simple case foldings
|
||||
# (where string lengths don't change), and implementations that allow full case folding
|
||||
# (where string lengths may grow). Note that where they can be supported, the
|
||||
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
|
||||
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
|
||||
#
|
||||
# All code points not listed in this file map to themselves.
|
||||
#
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2005/11/01 00:10:53 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -840,6 +840,13 @@ public final class ConvertUCD implements UCD_Types {
|
|||
|
||||
} else if (fieldName.equals("gc")) {
|
||||
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true);
|
||||
// if (major >= 5 && uData.script == Unknown_Script
|
||||
// && uData.generalCategory != Cn
|
||||
// && uData.generalCategory != Cs
|
||||
// && uData.generalCategory != Co) {
|
||||
// uData.script = COMMON_SCRIPT;
|
||||
// System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint));
|
||||
// }
|
||||
} else if (fieldName.equals("bc")) {
|
||||
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true);
|
||||
} else if (fieldName.equals("dt")) {
|
||||
|
@ -878,8 +885,17 @@ public final class ConvertUCD implements UCD_Types {
|
|||
uData.numericValue = Utility.doubleFrom(fieldValue);
|
||||
} else if (fieldName.equals("cc")) {
|
||||
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
|
||||
if (uData.combiningClass == 9 && major >= 5) {
|
||||
System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name);
|
||||
uData.binaryProperties |= (1<<GraphemeLink);
|
||||
System.out.println(uData);
|
||||
}
|
||||
} else if (fieldName.equals("bp")) {
|
||||
uData.binaryProperties = (byte)Utility.longFrom(fieldValue);
|
||||
// if (major >= 5 && (uData.binaryProperties & 1<<Noncharacter_Code_Point) != 0) {
|
||||
// uData.script = Unknown_Script;
|
||||
// }
|
||||
System.out.println("Resetting: " + uData);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown fieldName");
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -310,7 +310,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
|
||||
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
|
||||
|
||||
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
|
||||
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest");
|
||||
PrintWriter out = fc.out;
|
||||
|
||||
/* PrintWriter out = Utility.openPrintWriter("auxiliary\\"
|
||||
|
@ -354,7 +354,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
String[] testCase = new String[50];
|
||||
// do main test
|
||||
|
||||
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest"
|
||||
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest"
|
||||
+ (shortVersion ? "_SHORT" : ""));
|
||||
PrintWriter out = fc.out;
|
||||
/* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2005/03/26 05:40:04 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -585,8 +585,8 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
|
||||
out.println(UnicodeDataFile.generateDateLine());
|
||||
out.println("#");
|
||||
Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
|
||||
*/
|
||||
//Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);
|
||||
|
||||
Iterator it = sorted.keySet().iterator();
|
||||
int lastOrder = -1;
|
||||
|
@ -609,7 +609,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
case 3: out.println("# Ligatures"); break;
|
||||
case 4: skipLine = true; break;
|
||||
case 5: out.println("# No corresponding uppercase precomposed character"); break;
|
||||
case 6: Utility.appendFile("SpecialCasingIota.txt", Utility.UTF8, out); break;
|
||||
case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
|
||||
case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
|
||||
case 8: skipLine = true; break;
|
||||
}
|
||||
|
@ -617,7 +617,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
out.println(line);
|
||||
}
|
||||
//Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
|
||||
Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
|
||||
udf.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
||||
* $Date: 2005/11/19 05:39:39 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -38,6 +38,7 @@ import com.ibm.icu.dev.demo.translit.InfoDialog;
|
|||
import com.ibm.icu.dev.test.util.ArrayComparator;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
|
@ -423,15 +424,15 @@ public class GenerateConfusables {
|
|||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
bf.setLabelSource(null);
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setMergeRanges(true);
|
||||
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
|
||||
//reviews.putAll(UNASSIGNED, "");
|
||||
out.print("\uFEFF");
|
||||
out.println("# Review List for IDN");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("");
|
||||
|
||||
UnicodeSet fullSet = reviews.getSet("").complement();
|
||||
|
@ -478,7 +479,7 @@ public class GenerateConfusables {
|
|||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
bf.setLabelSource(null);
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setMergeRanges(true);
|
||||
|
||||
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
|
||||
|
@ -486,8 +487,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
|
||||
|
||||
out.println("# Recommended Identifier Profiles for IDN");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
|
||||
out.println("");
|
||||
out.println("# Output Characters");
|
||||
|
@ -549,15 +550,15 @@ public class GenerateConfusables {
|
|||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
bf.setLabelSource(null);
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setMergeRanges(true);
|
||||
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
|
||||
"xidmodifications.txt");
|
||||
|
||||
out.println("# Security Profile for General Identifiers");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("");
|
||||
|
||||
out.println("# Characters restricted");
|
||||
|
@ -613,8 +614,8 @@ public class GenerateConfusables {
|
|||
//someRemovals = removals;
|
||||
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
|
||||
out.println("# Characters restricted in domain names");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("#");
|
||||
out.println("# This file contains a draft list of characters for use in");
|
||||
out.println("# UTR #36: Unicode Security Considerations");
|
||||
|
@ -1148,8 +1149,8 @@ public class GenerateConfusables {
|
|||
public void writeSource(String directory, String filename) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.println("# Source File for IDN Confusables");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("");
|
||||
dataMixedAnycase.writeSource(out);
|
||||
out.close();
|
||||
|
@ -1159,8 +1160,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("");
|
||||
|
||||
if (appendFile) {
|
||||
|
@ -1347,7 +1348,7 @@ public class GenerateConfusables {
|
|||
}
|
||||
}
|
||||
|
||||
static class MyCollectionFilter implements CollectionUtilities.Filter {
|
||||
static class MyCollectionFilter implements CollectionUtilities.ObjectMatcher {
|
||||
UnicodeSet outputAllowed;
|
||||
int minLength;
|
||||
public boolean matches(Object o) {
|
||||
|
@ -1368,8 +1369,8 @@ public class GenerateConfusables {
|
|||
UnicodeSet representable = new UnicodeSet();
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("");
|
||||
MyEquivalenceClass data = dataMixedAnycase;
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
|
@ -1446,7 +1447,7 @@ public class GenerateConfusables {
|
|||
representable.removeAll(script);
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setValueSource(ups.getProperty("script"));
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.showSetNames(out, representable);
|
||||
}
|
||||
out.close();
|
||||
|
@ -1493,8 +1494,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Whole-Script Confusables");
|
||||
out.println("# $Revision: 1.9 $");
|
||||
out.println("# $Date: 2005/11/19 05:39:39 $");
|
||||
out.println("# $Revision: 1.10 $");
|
||||
out.println("# $Date: 2006/04/05 22:12:45 $");
|
||||
out.println("# This data is used for determining whether a strings is a");
|
||||
out.println("# whole-script or mixed-script confusable.");
|
||||
out.println("# The mappings here ignore common and inherited script characters,");
|
||||
|
@ -1539,7 +1540,7 @@ public class GenerateConfusables {
|
|||
script_set[i] = new UnicodeSet("[:script=" + UScript.getName(i) + ":]"); // ugly hack
|
||||
}
|
||||
bf.setValueSource(ups.getProperty("script"));
|
||||
bf.setShowLiteral(bf.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
}
|
||||
WholeScript(UnicodeSet filterSet, String label) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2005/10/11 19:39:15 $
|
||||
* $Revision: 1.39 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.40 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -756,41 +756,41 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
//log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
|
||||
//log.println(UnicodeDataFile.generateDateLine());
|
||||
log.println("#");
|
||||
log.println("# Normalization Test Suite");
|
||||
log.println("# Format:");
|
||||
log.println("#");
|
||||
log.println("# Columns (c1, c2,...) are separated by semicolons");
|
||||
log.println("# Comments are indicated with hash marks");
|
||||
log.println("#");
|
||||
log.println("# CONFORMANCE:");
|
||||
log.println("# 1. The following invariants must be true for all conformant implementations");
|
||||
log.println("#");
|
||||
log.println("# NFC");
|
||||
log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
|
||||
log.println("# c4 == NFC(c4) == NFC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFD");
|
||||
log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
|
||||
log.println("# c5 == NFD(c4) == NFD(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFKC");
|
||||
log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFKD");
|
||||
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
|
||||
log.println("#");
|
||||
log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
|
||||
log.println("# listed in Part 1, the following invariants must be true for all conformant");
|
||||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
// log.println("#");
|
||||
// log.println("# Normalization Test Suite");
|
||||
// log.println("# Format:");
|
||||
// log.println("#");
|
||||
// log.println("# Columns (c1, c2,...) are separated by semicolons");
|
||||
// log.println("# Comments are indicated with hash marks");
|
||||
// log.println("#");
|
||||
// log.println("# CONFORMANCE:");
|
||||
// log.println("# 1. The following invariants must be true for all conformant implementations");
|
||||
// log.println("#");
|
||||
// log.println("# NFC");
|
||||
// log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
|
||||
// log.println("# c4 == NFC(c4) == NFC(c5)");
|
||||
// log.println("#");
|
||||
// log.println("# NFD");
|
||||
// log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
|
||||
// log.println("# c5 == NFD(c4) == NFD(c5)");
|
||||
// log.println("#");
|
||||
// log.println("# NFKC");
|
||||
// log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
|
||||
// log.println("#");
|
||||
// log.println("# NFKD");
|
||||
// log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
|
||||
// log.println("#");
|
||||
// log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
|
||||
// log.println("# listed in Part 1, the following invariants must be true for all conformant");
|
||||
// log.println("# implementations:");
|
||||
// log.println("#");
|
||||
// log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
|
||||
System.out.println("Writing Part 1");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part0 # Specific cases");
|
||||
log.println("#");
|
||||
// log.println("#");
|
||||
// log.println("@Part0 # Specific cases");
|
||||
// log.println("#");
|
||||
|
||||
for (int j = 0; j < testSuiteCases.length; ++j) {
|
||||
writeLine(testSuiteCases[j], log, false);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
|
||||
* $Date: 2005/05/27 21:40:51 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -117,7 +117,7 @@ public final class GenerateNamedSequences implements UCD_Types {
|
|||
"@date@", Default.getDate(),
|
||||
"@table@", table};
|
||||
|
||||
Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
|
||||
Utility.appendFile("com/ibm/text/UCD/NamedSequences-Template.html", Utility.UTF8, out, replacementList);
|
||||
|
||||
out.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
|
||||
* $Date: 2005/10/11 19:39:15 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -104,18 +104,31 @@ public final class GenerateStandardizedVariants implements UCD_Types {
|
|||
|
||||
String version = Default.ucd().getVersion();
|
||||
int lastDot = version.lastIndexOf('.');
|
||||
String updateDirectory = version.substring(0,lastDot) + "-Update";
|
||||
int updateV = version.charAt(version.length()-1) - '0';
|
||||
if (updateV != 0) updateDirectory += (char)('1' + updateV);
|
||||
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
|
||||
String updateDirectory;
|
||||
String partialFilename;
|
||||
if (version.compareTo("4.1.0") < 0) {
|
||||
updateDirectory = version.substring(0,lastDot) + "-Update";
|
||||
int updateV = version.charAt(version.length()-1) - '0';
|
||||
if (updateV != 0) updateDirectory += (char)('1' + updateV);
|
||||
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
|
||||
partialFilename = "StandardizedVariants-" + Default.ucd().getVersion();
|
||||
} else if (version.compareTo("4.1.0") == 0) {
|
||||
updateDirectory = version.substring(0,lastDot) + "/ucd";
|
||||
partialFilename = "StandardizedVariants";
|
||||
} else {
|
||||
updateDirectory = version + "/ucd";
|
||||
partialFilename = "StandardizedVariants";
|
||||
}
|
||||
|
||||
|
||||
String[] replacementList = {
|
||||
"@revision@", Default.ucd().getVersion(),
|
||||
"@updateDirectory@", updateDirectory,
|
||||
"@filename@", partialFilename,
|
||||
"@date@", Default.getDate(),
|
||||
"@table@", table};
|
||||
|
||||
Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
|
||||
Utility.appendFile("com/ibm/text/UCD/StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
|
||||
|
||||
out.close();
|
||||
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
|
||||
|
|
|
@ -15,6 +15,7 @@ import java.util.Set;
|
|||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
|
||||
|
@ -91,7 +92,7 @@ class GenerateStringPrep implements UCD_Types {
|
|||
|
||||
void genStringPrep() throws IOException {
|
||||
//showScriptToBlock();
|
||||
bf.setShowLiteral(BagFormatter.toHTMLControl);
|
||||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
//bf.setValueSource(UnicodeLabel.NULL);
|
||||
if (false) {
|
||||
|
@ -508,7 +509,7 @@ class GenerateStringPrep implements UCD_Types {
|
|||
}
|
||||
return "<span title='" + ucd.getCodeAndName(string) + "'>"
|
||||
+ pad1
|
||||
+ BagFormatter.toHTMLControl.transliterate(string)
|
||||
+ TransliteratorUtilities.toHTMLControl.transliterate(string)
|
||||
+ pad
|
||||
+ "</span> ";
|
||||
}
|
||||
|
|
142
tools/unicodetools/com/ibm/text/UCD/IDNTester.java
Normal file
142
tools/unicodetools/com/ibm/text/UCD/IDNTester.java
Normal file
|
@ -0,0 +1,142 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.impl.PrettyPrinter;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class IDNTester {
|
||||
static StringBuffer inbuffer = new StringBuffer();
|
||||
static StringBuffer intermediate, outbuffer;
|
||||
static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
|
||||
static UnicodeSet IDNInputOnly = new UnicodeSet();
|
||||
static UnicodeSet IDNOutput = new UnicodeSet();
|
||||
static boolean initialized = false;
|
||||
static UnicodeSet IDInputOnly32 = new UnicodeSet();
|
||||
static UnicodeSet IDOutput32 = new UnicodeSet();
|
||||
static UnicodeSet IDInputOnly50 = new UnicodeSet();
|
||||
static UnicodeSet IDOutput50 = new UnicodeSet();
|
||||
static PrettyPrinter pp = new PrettyPrinter();
|
||||
static PrintWriter pw;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
initialize();
|
||||
pw = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "idnCount.html");
|
||||
pw.println("<html><body>");
|
||||
showSet("IDN InputOnly: ", IDNInputOnly);
|
||||
showSet("IDN Output: ", IDNOutput);
|
||||
showSet("ID InputOnly, U3.2: ", IDInputOnly32);
|
||||
showSet("ID Output, U3.2: ", IDOutput32);
|
||||
|
||||
showSet("IDN Output - ID Output, U3.2: ", new UnicodeSet(IDNOutput).removeAll(IDOutput32));
|
||||
showSet("IDN Output & ID Output, U3.2: ", new UnicodeSet(IDNOutput).retainAll(IDOutput32));
|
||||
showSet("ID Output - IDN Output, U3.2: ", new UnicodeSet(IDOutput32).removeAll(IDNOutput));
|
||||
|
||||
showSet("ID InputOnly, U5.0: ", IDInputOnly50);
|
||||
showSet("ID Output, U5.0: ", IDOutput50);
|
||||
showSet("ID Output, U5.0 - U3.2: ", new UnicodeSet(IDOutput50).removeAll(IDOutput32));
|
||||
|
||||
pw.println("</body></html>");
|
||||
|
||||
pw.close();
|
||||
}
|
||||
|
||||
public static void showSet(String title, UnicodeSet set) {
|
||||
pw.println("<h2>" + title + set.size() + "</h2>" + "<p>" + pp.toPattern(set) + "</p>");
|
||||
pw.println();
|
||||
}
|
||||
|
||||
static UnicodeSet getIDNInput() {
|
||||
if (!initialized) initialize();
|
||||
return IDNInputOnly;
|
||||
}
|
||||
|
||||
static UnicodeSet getIDNOutput() {
|
||||
if (!initialized) initialize();
|
||||
return IDNInputOnly;
|
||||
}
|
||||
|
||||
private static void initialize() {
|
||||
UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]");
|
||||
UCD U32 = UCD.make("3.2.0");
|
||||
Normalizer nfkc32 = new Normalizer(Normalizer.NFKC, "3.2.0");
|
||||
UCDProperty xid32 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U32);
|
||||
UnicodeSet IDInput32 = xid32.getSet();
|
||||
IDInput32.add('-').removeAll(oddballs);
|
||||
|
||||
UCD U50 = UCD.make("5.0.0");
|
||||
Normalizer nfkc50 = new Normalizer(Normalizer.NFKC, "5.0.0");
|
||||
UCDProperty xid50 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U50);
|
||||
UnicodeSet IDInput50 = xid50.getSet();
|
||||
IDInput50.add('-').removeAll(oddballs);
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) {
|
||||
System.out.println(i);
|
||||
System.out.flush();
|
||||
}
|
||||
int type = getIDNAType(i);
|
||||
if (type == OK) {
|
||||
IDNOutput.add(i);
|
||||
} else if (type != ILLEGAL) {
|
||||
IDNInputOnly.add(i);
|
||||
}
|
||||
if (IDInput32.contains(i)) {
|
||||
splitSet(IDInputOnly32, IDOutput32, U32, nfkc32, i);
|
||||
}
|
||||
if (IDInput50.contains(i)) {
|
||||
splitSet(IDInputOnly50, IDOutput50, U50, nfkc50, i);
|
||||
}
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) {
|
||||
if (i < 0x7F) {
|
||||
outputSet.add(i);
|
||||
return;
|
||||
}
|
||||
String v = UTF16.valueOf(i);
|
||||
String s = ucd.getCase(i, UCD.FULL, UCD.FOLD);
|
||||
if (s.equals(v)) {
|
||||
s = nfkc.normalize(s);
|
||||
if (s.equals(v)) {
|
||||
s = ucd.getCase(s, UCD.FULL, UCD.FOLD);
|
||||
if (s.equals(v)) {
|
||||
outputSet.add(i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
inputOnlySet.add(i);
|
||||
}
|
||||
|
||||
static public int getIDNAType(int cp) {
|
||||
if (cp == '-') return OK;
|
||||
inbuffer.setLength(0);
|
||||
UTF16.append(inbuffer, cp);
|
||||
try {
|
||||
intermediate = IDNA.convertToASCII(inbuffer,
|
||||
IDNA.DEFAULT); // USE_STD3_RULES
|
||||
if (intermediate.length() == 0)
|
||||
return DELETED;
|
||||
outbuffer = IDNA.convertToUnicode(intermediate,
|
||||
IDNA.USE_STD3_RULES);
|
||||
} catch (StringPrepParseException e) {
|
||||
return ILLEGAL;
|
||||
} catch (Exception e) {
|
||||
System.out.println("Failure at: " + Utility.hex(cp));
|
||||
return ILLEGAL;
|
||||
}
|
||||
if (!TestData.equals(inbuffer, outbuffer))
|
||||
return REMAPPED;
|
||||
return OK;
|
||||
}
|
||||
|
||||
}
|
75
tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt
Normal file
75
tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt
Normal file
|
@ -0,0 +1,75 @@
|
|||
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
|
||||
Let $number = [$gc:Nd $gc:Nl $gc:No]
|
||||
Let $mark = [$gc:mn $gc:me $gc:mc]
|
||||
Let $LMN = [$letter $number $mark]
|
||||
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
|
||||
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
|
||||
Let $nfc = [^$NFC_Quick_Check:No]
|
||||
|
||||
Show $nfc
|
||||
|
||||
Show [$alphabetic - [$mark $letter $number]]
|
||||
|
||||
|
||||
Let $oldCJK = [\u1100-\u11FF \u3040-\u30FF \u3130-\u318F \u31F0-\u31FF \u3400-\u4DBF \u4E00-\u9FFF \uAC00-\uD7AF \uF900-\uFAFF \uFF65-\uFFDC]
|
||||
|
||||
Show [$oldCJK & $gc:cn]
|
||||
|
||||
Let $fixedOld = [$oldCJK-$gc:cn]
|
||||
|
||||
|
||||
#List the non-alphabetic old items
|
||||
#Show [$oldCJK-$gc:cn-$alphabetic]
|
||||
|
||||
#Check for differences
|
||||
#Test $fixedOld = $trialNew
|
||||
|
||||
#ShowEach $mark
|
||||
|
||||
Let $uax29_outliers = [\u3031-\u3035 \u309B-\u309C \u30A0 \u30FC \uFF70 \uFF9E-\uFF9F]
|
||||
Let $other_outliers = [\u3099-\u309A \u3006 \u303C \u302A-\u302E \u302F \U000E0100-\U000E01EF]
|
||||
|
||||
# ==========================================
|
||||
|
||||
# Outliers from UAX29
|
||||
Show $uax29_outliers
|
||||
|
||||
# Additional outliers
|
||||
Show $other_outliers
|
||||
|
||||
# Take the 5 CJK scripts
|
||||
Let $trialScripts = [$script:hani $script:hang $script:kana $script:hira $script:bopo]
|
||||
|
||||
# Remove the non-LMN
|
||||
Let $trialNewBase = [$trialScripts & $LMN]
|
||||
|
||||
# Add the outliers
|
||||
Let $trialNew = [$trialNewBase $uax29_outliers $other_outliers]
|
||||
|
||||
# Show our result
|
||||
Show $trialNew
|
||||
|
||||
# As a double-check, show script characters we're tossing
|
||||
Show [$trialScripts - $trialNew]
|
||||
|
||||
# Compare snippets stuff
|
||||
Let $guessClose = [$lb:QU $lb:Close_Punctuation]
|
||||
Let $__closing_punc = ["')>\]`\}\u00AB\u00BB\u2018\u2019\u201C\u201D\u2039\u203A\u207E\u208E\u27E7\u27E9\u27EB\u2984\u2986\u2988\u298A\u298C\u298E\u2990\u2992\u2994\u2996\u2998\u29D9\u29DB\u29FD\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B\u301E\u301F\uFD3F\uFE42\uFE44\uFE5A\uFE5C\uFF02\uFF07\uFF09\uFF3D\uFF5D\uFF63]
|
||||
|
||||
$guessClose = $__closing_punc
|
||||
|
||||
Let $guessClose = [$gc:pf $gc:pe $gc:pi]
|
||||
$guessClose = $__closing_punc
|
||||
|
||||
Let $guessTerm = [$sb:aterm $sb:sterm]
|
||||
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? … ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
|
||||
|
||||
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
|
||||
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
|
||||
|
||||
Let $symOther = [$__issymotherr $__issymothers]
|
||||
|
||||
$symOther = $gcAllSymbols
|
||||
|
||||
|
||||
[$symOther & $nfc] = [$gcAllSymbols & $nfc]
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2005/10/11 19:39:15 $
|
||||
* $Revision: 1.36 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.37 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -160,8 +160,9 @@ public final class Main implements UCD_Types {
|
|||
//else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros();
|
||||
else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null);
|
||||
|
||||
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
|
||||
|
||||
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
|
||||
else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) MakeUnicodeFiles.main(new String[]{});
|
||||
|
||||
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
|
||||
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
|
||||
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
|
||||
|
|
|
@ -16,6 +16,7 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.text.Collator;
|
||||
|
@ -71,7 +72,7 @@ public class MakeNamesChart {
|
|||
System.out.println("file: " + chartPrefix + fileName);
|
||||
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
|
||||
BagFormatter.toHTML.transliterate(getHeading(lineParts[2])) +
|
||||
TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) +
|
||||
"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
|
||||
"<base target='names'></head><body>");
|
||||
|
||||
|
@ -117,7 +118,7 @@ public class MakeNamesChart {
|
|||
String hexcp = Utility.hex(it.codepoint, 4);
|
||||
String title = "";
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name != null) title = " title='" + BagFormatter.toHTML.transliterate(name.toLowerCase()) + "'";
|
||||
if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'";
|
||||
out.println("<td class='" + tdclass + "'"
|
||||
+ title
|
||||
+ ">\u00A0"
|
||||
|
@ -347,7 +348,7 @@ public class MakeNamesChart {
|
|||
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
|
||||
|
||||
private static String showTextConvertingHex(String body, boolean addCharToHex) {
|
||||
body = BagFormatter.toHTML.transliterate(body);
|
||||
body = TransliteratorUtilities.toHTML.transliterate(body);
|
||||
if (addCharToHex) {
|
||||
int position = 0;
|
||||
while (position < body.length()) {
|
||||
|
@ -411,7 +412,7 @@ public class MakeNamesChart {
|
|||
if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) {
|
||||
return "\u2588";
|
||||
}
|
||||
String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp));
|
||||
String result = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
|
||||
if (type == UCD.Me || type == UCD.Mn) {
|
||||
result = "\u25CC" + result;
|
||||
} else if (rtl.contains(cp)) {
|
||||
|
|
|
@ -68,6 +68,7 @@ public class MakeUnicodeFiles {
|
|||
|
||||
/**
 * Command-line entry point: runs generateFile() and then prints "DONE" to
 * standard output.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared here; presumably propagated from generateFile() -- confirm
 */
public static void main(String[] args) throws IOException {
|
||||
generateFile();
|
||||
System.out.println("DONE");
|
||||
}
|
||||
|
||||
static class Format {
|
||||
|
@ -294,7 +295,7 @@ public class MakeUnicodeFiles {
|
|||
*/
|
||||
try {
|
||||
BufferedReader br =
|
||||
Utility.openReadFile("MakeUnicodeFiles.txt", Utility.UTF8);
|
||||
Utility.openReadFile("com/ibm/text/UCD/MakeUnicodeFiles.txt", Utility.UTF8);
|
||||
String key = null;
|
||||
String file = null, property = null, value = "", comments = "";
|
||||
while (true) {
|
||||
|
@ -594,6 +595,7 @@ public class MakeUnicodeFiles {
|
|||
pw.println(SEPARATOR);
|
||||
pw.println("# Total: " + count);
|
||||
pw.println();
|
||||
pw.println("# EOF");
|
||||
udf.close();
|
||||
}
|
||||
|
||||
|
@ -710,6 +712,8 @@ public class MakeUnicodeFiles {
|
|||
pw.println(line);
|
||||
}
|
||||
}
|
||||
pw.println();
|
||||
pw.println("# EOF");
|
||||
udf.close();
|
||||
}
|
||||
|
||||
|
@ -769,10 +773,16 @@ public class MakeUnicodeFiles {
|
|||
ps.valueStyle = "none";
|
||||
}
|
||||
|
||||
if (ps.noLabel) bf.setLabelSource(null);
|
||||
if (ps.nameStyle.equals("none")) bf.setPropName(null);
|
||||
else if (ps.nameStyle.equals("short")) bf.setPropName(prop.getFirstNameAlias());
|
||||
else bf.setPropName(name);
|
||||
if (ps.noLabel) {
|
||||
bf.setLabelSource(null);
|
||||
}
|
||||
if (ps.nameStyle.equals("none")) {
|
||||
bf.setPropName(null);
|
||||
} else if (ps.nameStyle.equals("short")) {
|
||||
bf.setPropName(prop.getFirstNameAlias());
|
||||
} else {
|
||||
bf.setPropName(name);
|
||||
}
|
||||
|
||||
if (ps.interleaveValues) {
|
||||
writeInterleavedValues(pw, bf, prop, ps);
|
||||
|
@ -784,6 +794,8 @@ public class MakeUnicodeFiles {
|
|||
writeEnumeratedValues(pw, bf, unassigned, prop, ps);
|
||||
}
|
||||
}
|
||||
pw.println();
|
||||
pw.println("# EOF");
|
||||
udf.close();
|
||||
}
|
||||
|
||||
|
@ -809,6 +821,15 @@ public class MakeUnicodeFiles {
|
|||
temp2.addAll(aliases);
|
||||
aliases = temp2;
|
||||
}
|
||||
System.out.println("Check: " + prop.getValue(0xE000));
|
||||
String missing = ps.skipUnassigned != null ? ps.skipUnassigned : ps.skipValue;
|
||||
if (missing != null && !missing.equals("False")) {
|
||||
pw.println();
|
||||
String propName = bf.getPropName();
|
||||
if (propName == null) propName = "";
|
||||
else if (propName.length() != 0) propName = propName + "; ";
|
||||
pw.println("# @missing: 0000..10FFFF; " + propName + missing);
|
||||
}
|
||||
for (Iterator it = aliases.iterator(); it.hasNext();) {
|
||||
String value = (String)it.next();
|
||||
if (DEBUG) System.out.println("Getting value " + value);
|
||||
|
@ -891,6 +912,7 @@ public class MakeUnicodeFiles {
|
|||
pw.println();
|
||||
//if (s.size() != 0)
|
||||
bf.showSetNames(pw, s);
|
||||
//System.out.println(bf.showSetNames(s));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
Generate: NamedSequences
|
||||
Generate: .*
|
||||
DeltaVersion: 14
|
||||
CopyrightYear: 2005
|
||||
CopyrightYear: 2006
|
||||
|
||||
File: auxiliary/GraphemeBreakProperty
|
||||
Property: Grapheme_Cluster_Break
|
||||
|
@ -65,7 +65,10 @@ Value: 4.0
|
|||
# Newly assigned in Unicode 4.0.0 (April, 2003)
|
||||
|
||||
Value: 4.1
|
||||
# Newly assigned in Unicode 4.1.0 (XXX, 2005)
|
||||
# Newly assigned in Unicode 4.1.0 (March, 2005)
|
||||
|
||||
Value: 5.0
|
||||
# Newly assigned in Unicode 5.0.0 (XXX, 2006)
|
||||
|
||||
File: extracted/DerivedBidiClass
|
||||
Property: Bidi_Class
|
||||
|
@ -158,6 +161,10 @@ Property: Grapheme_Base
|
|||
# Note: depending on an application's interpretation of Co (private use),
|
||||
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
|
||||
|
||||
Property: Grapheme_Link
|
||||
# Derived Property: Grapheme_Link (deprecated)
|
||||
# Generated from: Canonical_Combining_Class=Virama
|
||||
# Use Canonical_Combining_Class=Virama directly instead
|
||||
|
||||
File: extracted/DerivedDecompositionType
|
||||
Property: Decomposition_Type
|
||||
|
@ -316,8 +323,6 @@ Property: Noncharacter_Code_Point
|
|||
|
||||
Property: Other_Grapheme_Extend
|
||||
|
||||
Property: Grapheme_Link
|
||||
|
||||
Property: IDS_Binary_Operator
|
||||
|
||||
Property: IDS_Trinary_Operator
|
||||
|
@ -353,7 +358,7 @@ Property: SPECIAL
|
|||
|
||||
File: Scripts
|
||||
Property: Script
|
||||
Format: nameStyle=none skipUnassigned=Common
|
||||
Format: nameStyle=none skipValue=Unknown
|
||||
|
||||
File: SpecialCasing
|
||||
Property: SPECIAL
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package com.ibm.text.UCD;
|
||||
import com.ibm.icu.impl.CollectionUtilities;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
import java.util.BitSet;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.io.PrintWriter;
|
||||
|
@ -194,6 +198,7 @@ public final class NFSkippable extends UCDProperty {
|
|||
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
|
||||
out.println(Utility.BOM);
|
||||
out.println("NFSafeSets");
|
||||
out.println("Version: " + Default.ucd().getVersion());
|
||||
out.println("Date: " + Default.getDate());
|
||||
|
@ -212,6 +217,8 @@ public final class NFSkippable extends UCDProperty {
|
|||
out.close();
|
||||
}
|
||||
|
||||
static Collator UCA = Collator.getInstance(ULocale.ROOT);
|
||||
|
||||
static void generateSet(PrintWriter out, String label, UCDProperty up) {
|
||||
System.out.println("Generating: " + up.getName(NORMAL));
|
||||
UnicodeSet result = new UnicodeSet();
|
||||
|
@ -227,11 +234,17 @@ public final class NFSkippable extends UCDProperty {
|
|||
out.println(label + " = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
|
||||
rSet = result.toPattern(false);
|
||||
if (true) {
|
||||
rSet = result.toPattern(false);
|
||||
} else {
|
||||
rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA);
|
||||
}
|
||||
|
||||
out.println("/*Unicode: ");
|
||||
writeStringInPieces(out, rSet, "*/");
|
||||
out.println();
|
||||
out.flush();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -5,30 +5,42 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
|
||||
* $Date: 2005/11/19 05:39:39 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2006/04/05 22:12:43 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintStream;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StreamTokenizer;
|
||||
import java.io.StringReader;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.demo.translit.CaseIterator;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator;
|
||||
import com.ibm.icu.impl.PrettyPrinter;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
//import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.text.RuleBasedCollator;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
@ -36,27 +48,27 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class QuickTest implements UCD_Types {
|
||||
public static void main(String[] args) throws IOException {
|
||||
try {
|
||||
|
||||
|
||||
checkCase();
|
||||
if (true) return;
|
||||
|
||||
getCaseFoldingUnstable();
|
||||
|
||||
getCaseLengths("Lower", UCD.LOWER);
|
||||
getCaseLengths("Upper", UCD.UPPER);
|
||||
getCaseLengths("Title", UCD.TITLE);
|
||||
getCaseLengths("Fold", UCD.FOLD);
|
||||
|
||||
if (true) return;
|
||||
checkUnicodeSet();
|
||||
getLengths("NFC", Default.nfc());
|
||||
getLengths("NFD", Default.nfd());
|
||||
getLengths("NFKC", Default.nfkc());
|
||||
getLengths("NFKD", Default.nfkd());
|
||||
|
||||
//getCaseFoldingUnstable();
|
||||
|
||||
checkCase();
|
||||
if (true) return;
|
||||
tem();
|
||||
//checkPrettyPrint();
|
||||
|
@ -643,13 +655,13 @@ public class QuickTest implements UCD_Types {
|
|||
if (!text.equals(x)) alpha.put("Lowercase", x);
|
||||
String title = x = UCharacter.toTitleCase(ULocale.ENGLISH,text,null);
|
||||
if (!text.equals(x)) alpha.put("Titlecase", x);
|
||||
String nfc = x = Normalizer.normalize(text,Normalizer.NFC);
|
||||
String nfc = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFC);
|
||||
if (!text.equals(x)) alpha.put("NFC", x);
|
||||
String nfd = x = Normalizer.normalize(text,Normalizer.NFD);
|
||||
String nfd = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFD);
|
||||
if (!text.equals(x)) alpha.put("NFD", x);
|
||||
x = Normalizer.normalize(text,Normalizer.NFKD);
|
||||
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKD);
|
||||
if (!text.equals(x)) alpha.put("NFKD", x);
|
||||
x = Normalizer.normalize(text,Normalizer.NFKC);
|
||||
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKC);
|
||||
if (!text.equals(x)) alpha.put("NFKC", x);
|
||||
|
||||
CanonicalIterator ci = new CanonicalIterator(text);
|
||||
|
|
|
@ -70,3 +70,6 @@
|
|||
# Note: the following case is already in the UnicodeData file.
|
||||
|
||||
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
|
||||
|
||||
# EOF
|
||||
|
||||
|
|
|
@ -31,9 +31,10 @@
|
|||
# A locale ID is defined by taking any language tag as defined by
|
||||
# RFC 3066 (or its successor), and replacing '-' by '_'.
|
||||
#
|
||||
# A context for a character C is defined by Section 3.13 Default Case Operations,
|
||||
# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.1.0,
|
||||
# as specified in http://www.unicode.org/versions/Unicode4.1.0/
|
||||
# A context for a character C is defined by Section 3.13 Default Case
|
||||
# Operations, of The Unicode Standard, Version 5.0.
|
||||
# (This is identical to the context defined by Unicode 4.1.0,
|
||||
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
|
||||
|
||||
"http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
|
||||
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta http-equiv="Content-Language" content="en-us">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<meta name="keywords" content="unicode, variant glyphs">
|
||||
<meta name="description" content="Describes and displays standardized variant glyphs">
|
||||
|
@ -19,8 +16,9 @@
|
|||
|
||||
<table class="header">
|
||||
<tr>
|
||||
<td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a> <a class="bar" href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a></td>
|
||||
<td class="icon"><a href="http://www.unicode.org">
|
||||
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a> <a class="bar" href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="gray"> </td>
|
||||
|
@ -29,105 +27,78 @@
|
|||
<blockquote>
|
||||
<h1>Standardized Variants</h1>
|
||||
<table class="wide">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td valign="top" width="144">Revision</td>
|
||||
<td valign="top">@revision@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Authors</td>
|
||||
<td valign="top">Members of the Editorial Committee</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Date</td>
|
||||
<td valign="top">@date@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">This Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/StandardizedVariants-@revision@.html">http://www.unicode.org/Public/@updateDirectory@/StandardizedVariants-@revision@.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Previous Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Latest Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
<tr>
|
||||
<td valign="top" width="144">Revision</td>
|
||||
<td valign="top">@revision@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Authors</td>
|
||||
<td valign="top">Members of the Editorial Committee</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Date</td>
|
||||
<td valign="top">@date@</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">This Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/@filename@.html">
|
||||
http://www.unicode.org/Public/@updateDirectory@/@filename@.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Previous Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html">
|
||||
http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top" width="144">Latest Version</td>
|
||||
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">
|
||||
http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<h3><br>
|
||||
<i>Summary</i></h3>
|
||||
<blockquote>
|
||||
<p>This file provides a visual display of the standard variant sequences
|
||||
derived from StandardizedVariants.txt.</p>
|
||||
<p>This file provides a visual display of the standard variant sequences derived from
|
||||
StandardizedVariants.txt.</p>
|
||||
</blockquote>
|
||||
<h3><i>Status</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The file and the files described herein are part of the <a href="http://www.unicode.org/ucd">Unicode
|
||||
Character Database</a> (UCD) and are governed by the <a href="#Terms of Use">UCD
|
||||
Terms of Use</a> stated at the end.</i></p>
|
||||
<p><i>This file and the files described herein are part of the Unicode Character Database and
|
||||
are governed by the terms of use at <a href="http://www.unicode.org/terms_of_use.html">
|
||||
http://www.unicode.org/terms_of_use.html</a>.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<h2>Introduction</h2>
|
||||
<p>The tables here <i>exhaustively</i> lists the valid, registered
|
||||
combinations of base character plus variation indicator. All combinations not
|
||||
listed in StandardizedVariants.txt are unspecified and are reserved for future
|
||||
standardization; no conformant process may interpret them as standardized
|
||||
variants. Variation selectors and their use are described in The Unicode
|
||||
Standard.</p>
|
||||
<p>These mathematical variants are all produced with the addition of Variation
|
||||
Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is
|
||||
no variation according to context. The Mongolian variants use the Mongolian
|
||||
Variant Selectors, and may vary according to context. That is, if a contextual
|
||||
shape is not listed below, then the variation sequence has an unmodified
|
||||
<p>The tables here <i>exhaustively</i> list the valid, registered combinations of base character
|
||||
plus variation indicator. All combinations not listed in StandardizedVariants.txt are unspecified
|
||||
and are reserved for future standardization; no conformant process may interpret them as
|
||||
standardized variants. Variation selectors and their use are described in The Unicode Standard.</p>
|
||||
<p>These mathematical variants are all produced with the addition of Variation Selector 1 (VS1 or
|
||||
U+FE00) to mathematical operator base characters. There is no variation according to context. The
|
||||
Mongolian variants use the Mongolian Variant Selectors, and may vary according to context. That
|
||||
is, if a contextual shape is not listed below, then the variation sequence has an unmodified
|
||||
appearance. At this time no Han variants exist.</p>
|
||||
<blockquote>
|
||||
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations
|
||||
are often derived from different physical fonts than the representative
|
||||
glyphs in the standard. They may therefore exhibit minor differences in
|
||||
size, proportion, or weight <i>unrelated</i> to the intentional difference
|
||||
in feature that is the defining element of the variation. Such minor
|
||||
differences should be ignored. Likewise, in some cases the existing
|
||||
representative fonts may not yet contain newly encoded characters and hence
|
||||
some representative glyphs shown in these tables may have a slightly
|
||||
different style than others.</p>
|
||||
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations are often derived
|
||||
from different physical fonts than the representative glyphs in the standard. They may therefore
|
||||
exhibit minor differences in size, proportion, or weight <i>unrelated</i> to the intentional
|
||||
difference in feature that is the defining element of the variation. Such minor differences
|
||||
should be ignored. Likewise, in some cases the existing representative fonts may not yet contain
|
||||
newly encoded characters and hence some representative glyphs shown in these tables may have a
|
||||
slightly different style than others.</p>
|
||||
</blockquote>
|
||||
<p>@table@</p>
|
||||
<hr width="50%">
|
||||
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
|
||||
<h3><i>Disclaimer</i></h3>
|
||||
<blockquote>
|
||||
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
|
||||
claims are made as to fitness for any particular purpose. No warranties of
|
||||
any kind are expressed or implied. The recipient agrees to determine
|
||||
applicability of information provided. If this file has been purchased on
|
||||
magnetic or optical media from Unicode, Inc., the sole remedy for any claim
|
||||
will be exchange of defective media within 90 days of receipt.</i></p>
|
||||
<p><i>This disclaimer is applicable for all other data files accompanying
|
||||
the Unicode Character Database, some of which have been compiled by the
|
||||
Unicode Consortium, and some of which have been supplied by other sources.</i></p>
|
||||
</blockquote>
|
||||
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
|
||||
<blockquote>
|
||||
<p><i>Recipient is granted the right to make copies in any form for internal
|
||||
distribution and to freely use the information supplied in the creation of
|
||||
products supporting the Unicode<sup>TM</sup> Standard. The files in the
|
||||
Unicode Character Database can be redistributed to third parties or other
|
||||
organizations (whether for profit or not) as long as this notice and the
|
||||
disclaimer notice are retained. Information can be extracted from these
|
||||
files and used in documentation or programs, as long as there is an
|
||||
accompanying notice indicating the source.</i></p>
|
||||
</blockquote>
|
||||
<hr width="50%">
|
||||
<div align="center">
|
||||
<center>
|
||||
<table cellspacing="0" cellpadding="0" border="0">
|
||||
<tr>
|
||||
<td><a href="http://www.unicode.org/unicode/copyright.html"><img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
|
||||
<td><a href="http://www.unicode.org/unicode/copyright.html">
|
||||
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
|
||||
</tr>
|
||||
</table>
|
||||
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
|
||||
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
|
||||
</script>
|
||||
</center>
|
||||
</div>
|
||||
</blockquote>
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2005/11/19 05:39:39 $
|
||||
* $Revision: 1.24 $
|
||||
* $Date: 2006/04/05 22:12:43 $
|
||||
* $Revision: 1.25 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -27,6 +27,7 @@ import com.ibm.icu.impl.CollectionUtilities;
|
|||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.impl.UCharArrayIterator;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.NumberFormat;
|
||||
import com.ibm.icu.text.StringPrep;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
|
@ -45,8 +46,17 @@ public class TestData implements UCD_Types {
|
|||
static UnicodeProperty.Factory upf;
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
//checkChars(false);
|
||||
|
||||
tryConsole2();
|
||||
if (true) return;
|
||||
|
||||
showNonCompatFull(false);
|
||||
showNonCompatFull(true);
|
||||
|
||||
|
||||
checkForCaseStability(false);
|
||||
//countChars();
|
||||
foo();
|
||||
|
||||
System.out.println("main: " + Default.getDate());
|
||||
upf = ICUPropertyFactory.make();
|
||||
System.out.println("after factory: " + Default.getDate());
|
||||
|
@ -146,8 +156,152 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
private static void showNonCompatFull(boolean compat) {
|
||||
UCD ucd = UCD.make("4.1.0");
|
||||
Normalizer nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion());
|
||||
System.out.println();
|
||||
System.out.println(compat ? "Full Fold = Simple Lower of NFKC" : "Full Fold != Simple Lower of NFKC");
|
||||
System.out.println();
|
||||
int count = 0;
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
int gc = ucd.getCategory(i);
|
||||
if (gc == Cn || gc == PRIVATE_USE) continue;
|
||||
//if (compat == (ucd.getDecompositionType(i) > UCD.CANONICAL)) continue;
|
||||
String str = UTF16.valueOf(i);
|
||||
String simpleLower = ucd.getCase(str, SIMPLE, LOWER);
|
||||
String fullFold = ucd.getCase(str, FULL, FOLD);
|
||||
|
||||
if (!simpleLower.equals(fullFold)) {
|
||||
String nfkcStr = nfkc.normalize(str);
|
||||
String simpleLowerNfkc = ucd.getCase(nfkcStr, SIMPLE, LOWER);
|
||||
if (compat != (fullFold.equals(simpleLowerNfkc))) continue;
|
||||
System.out.println(ucd.getCodeAndName(i));
|
||||
System.out.println("\tSimple Lower:\t" + ucd.getCodeAndName(simpleLower));
|
||||
System.out.println("\tFull Fold:\t" + ucd.getCodeAndName(fullFold));
|
||||
count++;
|
||||
}
|
||||
}
|
||||
System.out.println("Count:\t" + count);
|
||||
}
|
||||
|
||||
private static void tryConsole() throws UnsupportedEncodingException {
|
||||
for (int i = 1; i < 0xFFFF; ++i) {
|
||||
String s = UTF32.valueOf32(i);
|
||||
byte[] bytes = s.getBytes("UTF-8");
|
||||
String utf8bytes = "";
|
||||
for (int j = 0; j < bytes.length; ++j) {
|
||||
if (j != 0) utf8bytes += " ";
|
||||
utf8bytes += Utility.hex(bytes[j]&0xFF,2);
|
||||
}
|
||||
String name = UCharacter.getExtendedName(i);
|
||||
System.out.println(Utility.hex(i) + "\t(" + s + ")\t[" + utf8bytes + "]\t" + name);
|
||||
}
|
||||
}
|
||||
|
||||
private static void tryConsole2() throws UnsupportedEncodingException {
|
||||
UnicodeSet failures = new UnicodeSet();
|
||||
check:
|
||||
for (int i = 1; i <= 0x10FFFF; ++i) {
|
||||
String s = UTF32.valueOf32(i);
|
||||
byte[] bytes = s.getBytes("UTF-8");
|
||||
for (int j = 0; j < bytes.length; ++j) {
|
||||
switch (bytes[j]&0xFF) {
|
||||
case 0x81: case 0x8D: case 0x8F: case 0x90: case 0x9D:
|
||||
failures.add(i);
|
||||
continue check;
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("Total corrupted characters: " + failures.size());
|
||||
System.out.println("Percent corrupted characters: " + ((failures.size() + 0.0) / 0x110000 * 100.0 + "%"));
|
||||
//BagFormatter bf = new BagFormatter();
|
||||
//System.out.println(bf.showSetNames(failures));
|
||||
}
|
||||
|
||||
|
||||
private static void countChars() {
|
||||
int[][] count = new int[AGE_VERSIONS.length][50];
|
||||
for (int j = 1; j < AGE_VERSIONS.length; ++j) {
|
||||
UCD ucd = UCD.make(AGE_VERSIONS[j]);
|
||||
UCDProperty alpha = DerivedProperty.make(ucd.PropAlphabetic, ucd);
|
||||
|
||||
int alphaCount = 0;
|
||||
for (int i = 0; i <=0x10FFFF; ++i) {
|
||||
int type = ucd.getCategory(i);
|
||||
if (ucd.isNoncharacter(i)) type = LIMIT_CATEGORY;
|
||||
++count[j][type];
|
||||
if (alpha.hasValue(i) || type == ucd.Nd) ++count[j][LIMIT_CATEGORY+1];
|
||||
}
|
||||
}
|
||||
|
||||
for (byte i = -1; i < LIMIT_CATEGORY+2; ++i) {
|
||||
switch(i) {
|
||||
case -1: System.out.print("\t\t"); break;
|
||||
default: System.out.print(UCD.getCategoryID_fromIndex(i,UCD.LONG) + "\t" + UCD.getCategoryID_fromIndex(i)); break;
|
||||
case LIMIT_CATEGORY: System.out.print("Noncharacter" + "\t" + "NCCP"); break;
|
||||
case LIMIT_CATEGORY+1: System.out.print("Alphabetic" + "\t" + "alpha"); break;
|
||||
}
|
||||
for (int j = 1; j < AGE_VERSIONS.length; ++j) {
|
||||
if (i < 0) System.out.print("\t*" + AGE_VERSIONS[j] + "*");
|
||||
else System.out.print("\t" + count[j][i]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void foo() {
|
||||
String[] test = {
|
||||
"vicepresident",
|
||||
"vice president",
|
||||
"vice-president",
|
||||
"vice-président",
|
||||
"vice-president's offices",
|
||||
"vice-presidents' offices",
|
||||
"vice-presidents offices",
|
||||
"vice-presidentsoffices",
|
||||
};
|
||||
RuleBasedCollator col = (RuleBasedCollator) Collator.getInstance(new ULocale("fr"));
|
||||
col.setStrength(col.QUATERNARY);
|
||||
col.setAlternateHandlingShifted(false);
|
||||
|
||||
Arrays.sort(test, col);
|
||||
List s = Arrays.asList(test);
|
||||
String last = "";
|
||||
int[] level = new int[1];
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
String current = (String) it.next();
|
||||
int order = levelCompare(col, last, current, level);
|
||||
//System.out.print(levelStrings[level[0]]);
|
||||
//System.out.print(order < 0 ? "<" : order == 0 ? "=" : ">");
|
||||
System.out.println("\t" + current);
|
||||
last = current;
|
||||
}
|
||||
for (int i = 0; i < test.length; ++i) {
|
||||
System.out.print(test[i] + ";");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
// Dot markers of increasing length; the only visible use is the disabled debug
// print in foo(), which indexes this array by the level reported by levelCompare.
static String[] levelStrings = {".", "..", "...", "....", "....."};
|
||||
|
||||
static int levelCompare(RuleBasedCollator col, String a, String b, int[] level) {
|
||||
int diff = 0;
|
||||
level[0] = 0;
|
||||
for (int i = 0; i < 15; ++i) {
|
||||
col.setStrength(i);
|
||||
diff = col.compare(a, b);
|
||||
if (diff != 0) {
|
||||
level[0] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
Matcher m;
|
||||
|
||||
/**
|
||||
|
@ -163,12 +317,12 @@ public class TestData implements UCD_Types {
|
|||
return true;
|
||||
}
|
||||
|
||||
private static void checkChars(boolean mergeRanges) {
|
||||
private static void checkForCaseStability(boolean mergeRanges) {
|
||||
UCD ucd = Default.ucd();
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
UnicodeSet isUpper = ups.getSet("Uppercase=true");
|
||||
UnicodeSet isLower = ups.getSet("Lowercase=true");
|
||||
UnicodeSet isTitle = ups.getSet("gc=Lt");
|
||||
UnicodeSet propUppercase = ups.getSet("Uppercase=true");
|
||||
UnicodeSet propLowercase = ups.getSet("Lowercase=true");
|
||||
UnicodeSet isGcLt = ups.getSet("gc=Lt");
|
||||
UnicodeSet otherAlphabetic = ups.getSet("Alphabetic=true").addAll(ups.getSet("gc=Sk"));
|
||||
// create the following
|
||||
UnicodeSet hasFold = new UnicodeSet();
|
||||
|
@ -177,6 +331,10 @@ public class TestData implements UCD_Types {
|
|||
UnicodeSet hasTitle = new UnicodeSet();
|
||||
UnicodeSet compat = new UnicodeSet();
|
||||
UnicodeSet bicameralsScripts = new UnicodeSet();
|
||||
|
||||
UnicodeSet isFUppercase = new UnicodeSet();
|
||||
UnicodeSet isFLowercase = new UnicodeSet();
|
||||
UnicodeSet isFTitlecase = new UnicodeSet();
|
||||
|
||||
UCD u40 = UCD.make("4.0.0");
|
||||
BitSet scripts = new BitSet();
|
||||
|
@ -184,41 +342,83 @@ public class TestData implements UCD_Types {
|
|||
int gc = ucd.getCategory(i);
|
||||
if (gc == Cn || gc == PRIVATE_USE) continue;
|
||||
String str = UTF16.valueOf(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, FOLD))) hasFold.add(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(i);
|
||||
if (!str.equals(ucd.getCase(str, FULL, FOLD))) {
|
||||
hasFold.add(i);
|
||||
scripts.set(ucd.getScript(i));
|
||||
}
|
||||
if (!str.equals(ucd.getCase(str, FULL, UPPER))) {
|
||||
hasUpper.add(i);
|
||||
scripts.set(ucd.getScript(i));
|
||||
} else {
|
||||
isFUppercase.add(i);
|
||||
}
|
||||
if (!str.equals(ucd.getCase(str, FULL, LOWER))) {
|
||||
hasLower.add(i);
|
||||
scripts.set(ucd.getScript(i));
|
||||
} else {
|
||||
isFLowercase.add(i);
|
||||
}
|
||||
if (!str.equals(ucd.getCase(str, FULL, TITLE))) {
|
||||
hasTitle.add(i);
|
||||
scripts.set(ucd.getScript(i));
|
||||
} else {
|
||||
isFTitlecase.add(i);
|
||||
}
|
||||
if (!str.equals(ucd.getCase(str, FULL, TITLE))) hasTitle.add(i);
|
||||
if (!str.equals(Default.nfkd().normalize(str))) compat.add(i);
|
||||
//System.out.println(ucd.getCodeAndName(i) + "\t" + (u40.isAllocated(i) ? "already in 4.0" : "new in 4.1"));
|
||||
}
|
||||
BagFormatter bf = new BagFormatter();
|
||||
Transliterator nullTrans = Transliterator.getInstance("null");
|
||||
bf.setShowLiteral(nullTrans);
|
||||
bf.setMergeRanges(mergeRanges);
|
||||
bf.setUnicodePropertyFactory(ups);
|
||||
printItems(bf, compat, "isUpper or isTitle without hasLower",
|
||||
new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower));
|
||||
printItems(bf, compat, "hasLower, but not isUpper or isTitle",
|
||||
new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper));
|
||||
printItems(bf, compat, "isLower without hasUpper",
|
||||
new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper));
|
||||
printItems(bf, compat, "hasUpper, but not isLower or isTitle",
|
||||
new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower));
|
||||
|
||||
UnicodeSet allCased = new UnicodeSet().addAll(hasUpper).addAll(hasLower).addAll(hasTitle);
|
||||
isFUppercase.retainAll(allCased);
|
||||
isFLowercase.retainAll(allCased);
|
||||
isFTitlecase.retainAll(allCased);
|
||||
System.out.println(Utility.BOM);
|
||||
|
||||
printItems(bf, compat, "Uppercase=true or gc=Lt without hasLower",
|
||||
new UnicodeSet(propUppercase).addAll(isGcLt).removeAll(hasLower));
|
||||
printItems(bf, compat, "hasLower, but not (Uppercase=true or gc=Lt)",
|
||||
new UnicodeSet(hasLower).removeAll(isGcLt).removeAll(propUppercase));
|
||||
printItems(bf, compat, "Lowercase=true without hasUpper",
|
||||
new UnicodeSet(propLowercase).addAll(isGcLt).removeAll(hasUpper));
|
||||
printItems(bf, compat, "hasUpper, but not (Lowercase=true or gc=Lt)",
|
||||
new UnicodeSet(hasUpper).removeAll(isGcLt).removeAll(propLowercase));
|
||||
|
||||
|
||||
printItems(bf, compat, "Functionally Uppercase, but not Uppercase=true",
|
||||
new UnicodeSet(isFUppercase).removeAll(propUppercase));
|
||||
printItems(bf, compat, "Uppercase=true, but not functionally Uppercase",
|
||||
new UnicodeSet(propUppercase).removeAll(isFUppercase));
|
||||
|
||||
printItems(bf, compat, "Functionally Lowercase, but not Lowercase=true",
|
||||
new UnicodeSet(isFLowercase).removeAll(propLowercase));
|
||||
printItems(bf, compat, "Lowercase=true, but not functionally Lowercase",
|
||||
new UnicodeSet(propLowercase).removeAll(isFLowercase));
|
||||
|
||||
|
||||
UnicodeSet scriptSet = new UnicodeSet();
|
||||
UnicodeProperty scriptProp = ups.getProperty("Script");
|
||||
bf.setMergeRanges(true);
|
||||
System.out.println();
|
||||
System.out.println("Bicameral Scripts: those with at least one functionally cased character.");
|
||||
System.out.println();
|
||||
for (int i = 0; i < scripts.size(); ++i) {
|
||||
if (!scripts.get(i)) continue;
|
||||
if (i == COMMON_SCRIPT) continue;
|
||||
//if (i == COMMON_SCRIPT) continue;
|
||||
String scriptName = ucd.getScriptID_fromIndex((byte)i);
|
||||
System.out.println(scriptName);
|
||||
scriptSet.addAll(scriptProp.getSet(scriptName));
|
||||
UnicodeSet scriptUSet = scriptProp.getSet(scriptName);
|
||||
scriptSet.addAll(scriptUSet);
|
||||
printItems(bf, compat, "Bicameral Script: " + scriptName,
|
||||
new UnicodeSet(allCased).retainAll(scriptUSet));
|
||||
}
|
||||
UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle);
|
||||
printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased",
|
||||
bf.setMergeRanges(false);
|
||||
printItems(bf, compat, "Bicameral Script: isAlpha or Symbol Modifier, but not isCased",
|
||||
new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased));
|
||||
printItems(bf, compat, "(Bicameral) isCased, but not isAlpha or Symbol Modifier",
|
||||
printItems(bf, compat, "Bicameral Script: isCased, but not isAlpha or Symbol Modifier",
|
||||
new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic));
|
||||
}
|
||||
|
||||
|
@ -302,21 +502,21 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
public static class RegexMatcher implements UnicodeProperty.Matcher {
|
||||
public static class RegexMatcher implements UnicodeProperty.PatternMatcher {
|
||||
private Matcher matcher;
|
||||
|
||||
public UnicodeProperty.Matcher set(String pattern) {
|
||||
public UnicodeProperty.PatternMatcher set(String pattern) {
|
||||
matcher = Pattern.compile(pattern).matcher("");
|
||||
return this;
|
||||
}
|
||||
public boolean matches(String value) {
|
||||
matcher.reset(value);
|
||||
public boolean matches(Object value) {
|
||||
matcher.reset((String)value);
|
||||
return matcher.matches();
|
||||
}
|
||||
}
|
||||
|
||||
static BagFormatter bf = new BagFormatter();
|
||||
static UnicodeProperty.Matcher matcher = new RegexMatcher();
|
||||
static UnicodeProperty.PatternMatcher matcher = new RegexMatcher();
|
||||
|
||||
private static void showPropDiff(String p1, UnicodeSet s1, String p2, UnicodeSet s2) {
|
||||
System.out.println("Property Listing");
|
||||
|
|
|
@ -26,8 +26,16 @@ public class TestIdentifiers {
|
|||
public static void main(String[] args) throws IOException {
|
||||
String[] tests = { "SØS", "façade", "MOPE", "VOP", "scope", "ibm", "vop",
|
||||
"toys-я-us", "1iνе", "back", "boгing" };
|
||||
|
||||
TestIdentifiers ti = new TestIdentifiers("L");
|
||||
TestIdentifiers tiany = new TestIdentifiers("A");
|
||||
ti.loadIdentifiers();
|
||||
UnicodeSet idnCharSet = ti.idnChars.getSet("output", new UnicodeSet());
|
||||
System.out.println("idnCharSet: " + idnCharSet.size());
|
||||
UnicodeSet idnCharNonStarting = ti.nonstarting;
|
||||
System.out.println("idnCharNonStarting: " + idnCharSet);
|
||||
if (true) return;
|
||||
|
||||
for (int i = 0; i < tests.length; ++i) {
|
||||
System.out.print(tests[i]);
|
||||
String folded = UCharacter.foldCase(tests[i], true);
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.List;
|
|||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.tool.UOption;
|
||||
import com.ibm.icu.text.SymbolTable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
|
@ -17,9 +18,26 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class TestUnicodeInvariants {
|
||||
private static final int
|
||||
HELP1 = 0,
|
||||
FILE = 1,
|
||||
RANGE = 2
|
||||
;
|
||||
|
||||
private static final UOption[] options = {
|
||||
UOption.HELP_H(),
|
||||
UOption.create("file", 'f', UOption.REQUIRES_ARG),
|
||||
UOption.create("range", 'r', UOption.NO_ARG),
|
||||
};
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
testInvariants();
|
||||
UOption.parseArgs(args, options);
|
||||
|
||||
String file = "UnicodeInvariants.txt";
|
||||
if (options[FILE].doesOccur) file = options[FILE].value;
|
||||
boolean doRange = options[RANGE].doesOccur;
|
||||
|
||||
testInvariants(file, doRange);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -68,19 +86,19 @@ public class TestUnicodeInvariants {
|
|||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
public static void testInvariants() throws IOException {
|
||||
public static void testInvariants(String outputFile, boolean doRange) throws IOException {
|
||||
String[][] variables = new String[100][2];
|
||||
int variableCount = 0;
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", "UnicodeInvariants.txt");
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
BagFormatter bf2 = new BagFormatter();
|
||||
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
bf2.setMergeRanges(false);
|
||||
bf2.setMergeRanges(doRange);
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
ParsePosition pp = new ParsePosition(0);
|
||||
int parseErrorCount = 0;
|
||||
|
@ -113,11 +131,20 @@ public class TestUnicodeInvariants {
|
|||
// detect variables
|
||||
if (line.startsWith("Show")) {
|
||||
String part = line.substring(4).trim();
|
||||
if (part.startsWith("Each")) {
|
||||
part = part.substring(4).trim();
|
||||
bf2.setMergeRanges(false);
|
||||
}
|
||||
pp.setIndex(0);
|
||||
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
|
||||
bf2.showSetNames(out, leftSet);
|
||||
bf2.setMergeRanges(doRange);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line.startsWith("Test")) {
|
||||
line = line.substring(4).trim();
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
String rightSide = null;
|
||||
|
@ -166,7 +193,7 @@ public class TestUnicodeInvariants {
|
|||
|
||||
boolean ok = true;
|
||||
switch(relation) {
|
||||
case '=': ok = leftSet.equals(rightSet); break;
|
||||
case '=': case '\u2261': ok = leftSet.equals(rightSet); break;
|
||||
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
|
||||
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
|
||||
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
|
||||
|
|
|
@ -254,18 +254,19 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
|
|||
, "Katakana");
|
||||
Object foo = unicodeMap.getSet("Katakana");
|
||||
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
|
||||
UnicodeProperty lineBreak = getProperty("Line_Break");
|
||||
unicodeMap.putAll(getProperty("Alphabetic").getSet("true")
|
||||
.add(0xA0).add(0x05F3)
|
||||
.removeAll(getProperty("Ideographic").getSet("true"))
|
||||
.removeAll(unicodeMap.getSet("Katakana"))
|
||||
.removeAll(script.getSet("Thai"))
|
||||
.removeAll(script.getSet("Lao"))
|
||||
//.removeAll(script.getSet("Thai"))
|
||||
//.removeAll(script.getSet("Lao"))
|
||||
.removeAll(lineBreak.getSet("SA"))
|
||||
.removeAll(script.getSet("Hiragana"))
|
||||
.removeAll(graphemeExtend),
|
||||
"ALetter");
|
||||
unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]")
|
||||
,"MidLetter");
|
||||
UnicodeProperty lineBreak = getProperty("Line_Break");
|
||||
unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
|
||||
.remove(0x003A), "MidNum");
|
||||
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.40 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.41 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -43,7 +43,8 @@ public final class UCD implements UCD_Types {
|
|||
/**
|
||||
* Used for the default version.
|
||||
*/
|
||||
public static final String latestVersion = "5.1.0";
|
||||
public static final String latestVersion = "5.0.0";
|
||||
public static final String lastVersion = "4.1.0";
|
||||
|
||||
/**
|
||||
* Create singleton instance for default (latest) version
|
||||
|
@ -803,6 +804,9 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
|
||||
public byte getScript(int codePoint) {
|
||||
if (codePoint == 0xE000) {
|
||||
codePoint += 0;
|
||||
}
|
||||
return get(codePoint, false).script;
|
||||
}
|
||||
|
||||
|
@ -1398,6 +1402,7 @@ to guarantee identifier closure.
|
|||
}
|
||||
if (isHangul) {
|
||||
if (fixStrings) result.decompositionMapping = getHangulDecompositionPair(codePoint);
|
||||
if (isLV(codePoint)) result.lineBreak = LB_H2; else result.lineBreak = LB_H3;
|
||||
result.decompositionType = CANONICAL;
|
||||
}
|
||||
return result;
|
||||
|
@ -1612,6 +1617,9 @@ to guarantee identifier closure.
|
|||
}
|
||||
|
||||
combiningClassSet.set(uData.combiningClass & 0xFF);
|
||||
if (cp == 0xE000) {
|
||||
System.out.println("Check: " + uData.script);
|
||||
}
|
||||
add(uData);
|
||||
}
|
||||
/*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2005/03/10 02:37:20 $
|
||||
* $Revision: 1.31 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.32 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -345,7 +345,12 @@ final class UCD_Names implements UCD_Types {
|
|||
"SYLOTI_NAGRI",
|
||||
"OLD_PERSIAN",
|
||||
"KHAROSHTHI",
|
||||
|
||||
"Balinese",
|
||||
"Cuneiform",
|
||||
"Phoenician",
|
||||
"Phags-pa",
|
||||
"Nko",
|
||||
"Unknown"
|
||||
};
|
||||
|
||||
public static final Map EXTRA_SCRIPT = new HashMap();
|
||||
|
@ -426,11 +431,14 @@ final class UCD_Names implements UCD_Types {
|
|||
"Sylo",
|
||||
"Xpeo",
|
||||
"Khar",
|
||||
|
||||
"Bali",
|
||||
"Xsux",
|
||||
"Phnx",
|
||||
"Phag",
|
||||
"Nkoo",
|
||||
"Zzzz"
|
||||
};
|
||||
|
||||
|
||||
|
||||
static final String[] AGE = {
|
||||
"unassigned",
|
||||
"1.1",
|
||||
|
@ -441,9 +449,9 @@ final class UCD_Names implements UCD_Types {
|
|||
"3.2",
|
||||
"4.0",
|
||||
"4.1",
|
||||
"5.0",
|
||||
};
|
||||
|
||||
|
||||
static final String[] GENERAL_CATEGORY = {
|
||||
"Cn", // = Other, Not Assigned 0
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.32 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.33 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -398,8 +398,14 @@ public interface UCD_Types {
|
|||
SYLOTI_NAGRI = 60,
|
||||
OLD_PERSIAN = 61,
|
||||
KHAROSHTHI = 62,
|
||||
Balinese = 63,
|
||||
Cuneiform = 64,
|
||||
Phoenician = 65,
|
||||
Phags_Pa = 66,
|
||||
NKo = 67,
|
||||
Unknown_Script = 68,
|
||||
|
||||
LIMIT_SCRIPT = 63;
|
||||
LIMIT_SCRIPT = 69;
|
||||
|
||||
static final int
|
||||
UNKNOWN = 0,
|
||||
|
@ -411,7 +417,8 @@ public interface UCD_Types {
|
|||
AGE32 = 6,
|
||||
AGE40 = 7,
|
||||
AGE41 = 8,
|
||||
LIMIT_AGE = 9;
|
||||
AGE50 = 9,
|
||||
LIMIT_AGE = 10;
|
||||
|
||||
static final String[] AGE_VERSIONS = {
|
||||
"?",
|
||||
|
@ -422,7 +429,8 @@ public interface UCD_Types {
|
|||
"3.1.0",
|
||||
"3.2.0",
|
||||
"4.0.0",
|
||||
"4.1.0"
|
||||
"4.1.0",
|
||||
"5.0.0"
|
||||
};
|
||||
|
||||
public static byte
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2004/02/12 08:23:16 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -45,7 +45,7 @@ class UData implements UCD_Types {
|
|||
byte lineBreak = LB_XX;
|
||||
byte joiningType = -1;
|
||||
byte joiningGroup = NO_SHAPING;
|
||||
byte script = COMMON_SCRIPT;
|
||||
byte script = Unknown_Script;
|
||||
byte age = 0;
|
||||
|
||||
static final UData UNASSIGNED = new UData();
|
||||
|
|
|
@ -1,10 +1,4 @@
|
|||
#/**
|
||||
# *******************************************************************************
|
||||
# * Copyright (C) 2002-2004, International Business Machines Corporation and *
|
||||
# * others. All Rights Reserved. *
|
||||
# *******************************************************************************
|
||||
# */
|
||||
#Override List
|
||||
#Override List
|
||||
#Format is <code><tab><char><tab><pinyin>(<tab><comment>)?
|
||||
#Note: the 'code' field is currently discarded; only the char is important.
|
||||
#Note: if there is conflict, the FIRST char wins.
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Counter.java,v $
|
||||
* $Date: 2005/10/11 19:39:15 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2006/04/05 22:12:45 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,7 +21,7 @@ import java.io.*;
|
|||
import java.text.*;
|
||||
|
||||
public final class Counter {
|
||||
Map map = new HashMap();
|
||||
private Map map = new HashMap();
|
||||
|
||||
static public final class RWInteger implements Comparable {
|
||||
static int uniqueCount;
|
||||
|
@ -92,5 +92,11 @@ public final class Counter {
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
public Set keySet() {
|
||||
return map.keySet();
|
||||
}
|
||||
|
||||
public Map getMap() {
|
||||
return Collections.unmodifiableMap(map);
|
||||
}
|
||||
}
|
|
@ -48,7 +48,7 @@ public class UnicodeDataFile {
|
|||
out.println("# For documentation, see UCD.html");
|
||||
}
|
||||
try {
|
||||
Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
|
||||
Utility.appendFile("com/ibm/text/UCD/" + filename + "Header" + fileType, Utility.UTF8_UNIX, out);
|
||||
} catch (FileNotFoundException e) {
|
||||
/*
|
||||
out.println("# Unicode Character Database: Derived Property Data");
|
||||
|
|
|
@ -77,7 +77,10 @@ exist:<br>
|
|||
<UCD_DIR>/EXTRAS-Update</p>
|
||||
<h3>2. Download all of the UnicodeData files for each version into UCD_DIR.</h3>
|
||||
<p>The folder names must be of the form: "3.2.0-Update", so rename the folders on the<br>
|
||||
Unicode site to this format.</p>
|
||||
Unicode site to this format. <span style="background-color: #FFFF00">If the
|
||||
folder contains ucd, then make the contents of that directory be the contents of
|
||||
the x.x.x-Update directory. That is, each directory will directly contain files
|
||||
like PropList....txt</span></p>
|
||||
<h4>2a Ensure Complete Release</h4>
|
||||
<p>If you are downloading any "incomplete" release (one that does not contain a complete set of data
|
||||
files for that release), you need to also download the previous complete release. Most of the N.M-Update
|
||||
|
@ -87,6 +90,7 @@ directoriess are complete, *except*:</p>
|
|||
<p>Also, make the following changes to UnicodeData for 1.1.5:</p>
|
||||
<p><b>Delete</b></p>
|
||||
<pre>3400;HANGUL SYLLABLE KIYEOK A;Lo;0;L;1100 1161;;;;N;;;;;
|
||||
...
|
||||
4DFF;HANGUL SYLLABLE MIEUM WEO RIEUL-THIEUTH;Lo;0;L;1106 116F 11B4;;;;N;;;;;
|
||||
4E00;<cjk IDEOGRAPH REPRESENTATIVE>;Lo;0;L;;;;;N;;;;;</pre>
|
||||
<p><b>Add:</b></p>
|
||||
|
@ -106,18 +110,19 @@ BASE_DIR + "Collation\allkeys" + VERSION + ".txt".<br>
|
|||
<br>
|
||||
If you have it in a different location, change that value for KEYS in UCA.java, and <br>
|
||||
the value for BASE_DIR</p>
|
||||
<h4>2c. Here is an example of the default directory structure with files:</h4>
|
||||
<h4>2c. Here is an example of the default directory structure with files. All of
|
||||
the yellow ones should exist</h4>
|
||||
<pre>C://DATA/
|
||||
|
||||
BIN/
|
||||
|
||||
Collation/
|
||||
<span style="background-color: #FFFF00"> Collation/
|
||||
allkeys-3.1.1.txt
|
||||
|
||||
</span>
|
||||
GEN/
|
||||
DerivedData/
|
||||
ExtractedProperties
|
||||
UCD/
|
||||
<span style="background-color: #FFFF00"> </span><span style="background-color: #FFFF00">UCD/
|
||||
3.0.0-Update/
|
||||
Unihan-3.2.0.txt
|
||||
...
|
||||
|
@ -133,69 +138,145 @@ the value for BASE_DIR</p>
|
|||
ArabicShaping-4.0.0d14b.txt
|
||||
BidiMirroring-4.0.0d1b.txt
|
||||
...
|
||||
EXTRAS-Update/</pre>
|
||||
EXTRAS-Update/</span></pre>
|
||||
<h3>3. Versions</h3>
|
||||
<p>All of the following have "version X" in the options you give to Java (either on the
|
||||
command line, or in the Eclipse 'run' options). If you want a specific version like 3.1.0, then you
|
||||
would write "version 3.1.0". If you want the latest version (4.1.0), you can omit the "version X".</p>
|
||||
<h3>4. Running UCD, you will use com.ibm.text.UCD.Main as your main class.</h3>
|
||||
<p>The Working directory has to be C:\ICU4J\unicodetools\com\ibm\text\UCD<br>
|
||||
(In Eclipse you can also use ${workspace_loc:UnicodeTools/com/ibm/text/UCD}, which abstracts away
|
||||
the location.)<br>
|
||||
<br>
|
||||
The same for UCA:</p>
|
||||
<p>main: com.ibm.text.UCA.Main<br>
|
||||
directory: <a href="file:///C:/ICU4J/unicodetools/com/ibm/text/UCA">
|
||||
C:\ICU4J\unicodetools\com\ibm\text\UCA</a></p>
|
||||
<h4>4a. BIN</h4>
|
||||
<p>For each version, the tools build a set of binary data in BIN that contain the information for
|
||||
that release. This is done automatically, or you can manually do it with the options<br>
|
||||
<br>
|
||||
version X build<br>
|
||||
<br>
|
||||
This builds a compressed format of all the UCD data (except blocks and Unihan) into the BIN
|
||||
directory. Don't worry about the voluminous console messages, unless one says "FAIL".<br>
|
||||
<br>
|
||||
<font color="#FF0000"><i>You have to manually do this if you change any of the data files in that
|
||||
version!!</i></font></p>
|
||||
<p>Note: if for any reason you modify the binary format of the BIN files, you also have to bump the
|
||||
value in that file:<br>
|
||||
<br>
|
||||
static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes</p>
|
||||
<h4>4b. To build the Unicode files for a particular version X, run the Main with the following
|
||||
argument:</h4>
|
||||
<p>MakeUnicodeFiles.generateFile</p>
|
||||
<p>This will execute the commands in the file MakeUnicodeFiles.txt.</p>
|
||||
<p>You will edit that file if you want a different 'd' version for the files, OR if you want to
|
||||
change which files are built. At the top of the file you will see the following text:</p>
|
||||
<pre>Generate: </pre>
|
||||
<pre>DeltaVersion: 7</pre>
|
||||
<h4>4c. To change which files are built, put any number of regular expressions separated by spaces
|
||||
after Generate. Eg,</h4>
|
||||
<pre>Generate: .*line.* prop.*</pre>
|
||||
<p>The matching is case-insensitive.</p>
|
||||
<h4>4d. To change the 'd' number that is appended to the generated files names, change the
|
||||
DeltaVersion.</h4>
|
||||
<h4>4e. To run basic consistency checking, run:</h4>
|
||||
<p>version X verify<br>
|
||||
<br>
|
||||
Don't worry about any console messages except those that say FAIL.</p>
|
||||
<h4>4f. Output</h4>
|
||||
<p>The files will be generated in the GEN directories.</p>
|
||||
<ul>
|
||||
<li>If they are the same as previous files (except for the first line and Date), they will be
|
||||
renamed to UNCHANGED... </li>
|
||||
<li>If they are not, then a bat file will be generated in the DIFF directory. Double-clicking on
|
||||
this file will launch CompareIt, which is a nice diff program. Get compareIt from
|
||||
<a class="xurl" href="http://www.grigsoft.com/files.htm">http://www.grigsoft.com/files.htm</a> (be
|
||||
sure to get the Unicode version),then you can also set it as the diff program in CVS with
|
||||
Admin/Preferences/WinCVS, External Diff = C:\Program Files\Compare It!\wincmp3.exe (or equiv).</li>
|
||||
</ul>
|
||||
<h3>5. Running UCA, you will use com.ibm.text.UCA.Main as your main class.</h3>
|
||||
<h4>5a. To build all the UCA files used by ICU, use the option:</h4>
|
||||
<p>java <UCA>Main ICU</p>
|
||||
<h4>6. To build all the charts, use the UCA project, with options: normalizationChart caseChart
|
||||
scriptChart indexChart</h4>
|
||||
<h3>4. Building Files</h3>
|
||||
<ol>
|
||||
<li><b>Setup</b><ol>
|
||||
<li>In Eclipse, open the Package Explorer (Use Window>Show View if you
|
||||
don't see it)</li>
|
||||
<li>Open UnicodeTools<ul>
|
||||
<li>com.ibm.text.UCD<ul>
|
||||
<li>MakeUnicodeFiles.<span style="background-color: #FFFF00">txt</span><p>This file drives the production of
|
||||
the derived Unicode files. The first three lines contain
|
||||
parameters that you may want to modify at some times:</p>
|
||||
<pre>Generate: <b>.*script.*</b> <i>// this is a regular expression. Use .* for all files</i>
|
||||
DeltaVersion: <b>10</b> <i> // This gets appended to the file name. Pick 1+ the highest value in Public</i>
|
||||
CopyrightYear: <b>2006</b> <i> // Pick the current year</i></pre>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Open in Package Explorer
|
||||
<ul>
|
||||
<li>com.ibm.text.UCD<ul>
|
||||
<li>Main</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Run>Run As...<ol>
|
||||
<li>Choose Java Application<ul>
|
||||
<li>it will fail, don't worry; you need to set some parameters</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
<li>Run>Run...<ul>
|
||||
<li>Select the Arguments tab, and fill in the following<ul>
|
||||
<li>Program arguments:<pre>build 5.0 MakeUnicodeFiles</pre>
|
||||
</li>
|
||||
<li>VM arguments:
|
||||
<pre>-Xms512m -Xmx512m</pre>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Close and Save</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
<li><b>Run</b><ol>
|
||||
<li>You'll see it build the 5.0 files, with something like the following
|
||||
results:<pre>Writing UCD_Data5.0.0
|
||||
Data Size: 109,802
|
||||
Wrote Data 109802</pre>
|
||||
</li>
|
||||
<li>For each version, the tools build a set of binary data in BIN that
|
||||
contain the information for that release. This is done automatically, or
|
||||
you can manually do it with the Program Arguments<pre>version X build</pre>
|
||||
<p>This builds a compressed format of all the UCD data (except blocks
|
||||
and Unihan) into the BIN directory. Don't worry about the voluminous
|
||||
console messages, unless one says "FAIL".</p>
|
||||
<p><font color="#FF0000"><i>You have to manually do this if you change
|
||||
any of the data files in that version!</i></font></p>
|
||||
<p>Note: if for any reason you modify the binary format of the BIN files, you also have to bump the
|
||||
value in that file:</p>
|
||||
<pre>static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes</pre>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
<li>Results in <a href="file:///C:/DATA/GEN/DerivedData">
|
||||
C:\DATA\GEN\DerivedData</a><ol>
|
||||
<li>The files will be in this directory.</li>
|
||||
<li>There are also DIFF folders, that contain BAT files that you can run
|
||||
on Windows with CompareIt. (You can modify the code to build BATs with
|
||||
another Diff program if you want).<ol>
|
||||
<li>For any file with a significant difference, it will build two
|
||||
BAT files, such as the first two below.<pre>Diff_PropList-5.0.0d10.txt.bat
|
||||
OLDER-Diff_PropList-5.0.0d10.txt.bat
|
||||
|
||||
UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
<li>Any files without significant changes will have "UNCHANGED" as a
|
||||
prefix: ignore them. The OLDER prefix is the comparison to the
|
||||
last version of Unicode.</li>
|
||||
<li>On Windows you can run these BATs to compare files:</li>
|
||||
</ol>
|
||||
</li>
|
||||
</ol>
|
||||
<h3>5. Invariant Checking</h3>
|
||||
<ol>
|
||||
<li>Setup<ol>
|
||||
<li>Open in Package Explorer<ul>
|
||||
<li>com.ibm.text.UCD<ul>
|
||||
<li>TestUnicodeInvariants.java</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Run>Run As... Java Application<br>
|
||||
Will create the following file of results:<pre><a href="file:///C:/DATA/GEN/UnicodeInvariantResults.txt">C:\DATA\GEN\UnicodeInvariantResults.txt</a></pre>
|
||||
</li>
|
||||
<li>Open that file and search for "**** START Error Info ****" Each such
|
||||
point provides a dump of comparison information.</li>
|
||||
</ol>
|
||||
</li>
|
||||
</ol>
|
||||
<h3>6. Options</h3>
|
||||
<ol>
|
||||
<li>If you want to see files that are opened while processing, do the
|
||||
following:<ol>
|
||||
<li>Run>Run</li>
|
||||
<li>Select the Arguments tab, and add the following<ol>
|
||||
<li>VM arguments:
|
||||
<pre>-DSHOW_FILES</pre>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
</ol>
|
||||
</li>
|
||||
</ol>
|
||||
<h3>7. UCA</h3>
|
||||
<ol>
|
||||
<li>
|
||||
<h3>You will use com.ibm.text.UCA.Main as your main class, creating along
|
||||
the same lines as above.</h3></li>
|
||||
<li>
|
||||
<h4>To build all the UCA files used by ICU, use the Program arguments:</h4>
|
||||
<pre>Main ICU</pre>
|
||||
</li>
|
||||
<li>
|
||||
<h4>To build all the charts, use the UCA project, with options: </h4>
|
||||
<pre>normalizationChart caseChart scriptChart indexChart</pre>
|
||||
</li>
|
||||
</ol>
|
||||
|
||||
</body>
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue