mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-4700 Misc tools
X-SVN-Rev: 18741
This commit is contained in:
parent
ddcee69efa
commit
b120a3251b
14 changed files with 853 additions and 196 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2004/11/12 23:17:15 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2005/11/01 00:10:53 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -396,142 +396,145 @@ public final class ConvertUCD implements UCD_Types {
|
|||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
try {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
int count = Utility.split(line,';',parts);
|
||||
|
||||
if (false && parts[0].equals("2801")) {
|
||||
System.out.println("debug?");
|
||||
}
|
||||
if (false && parts[0].equals("2801")) {
|
||||
System.out.println("debug?");
|
||||
}
|
||||
|
||||
// fix malformed or simple lists.
|
||||
// fix malformed or simple lists.
|
||||
|
||||
if (count != labels.length) {
|
||||
if (count == labels.length + 1 && parts[count-1].equals("")) {
|
||||
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
|
||||
showedSemi = true;
|
||||
} else if (count == 1) { // fix simple list
|
||||
++count;
|
||||
parts[1] = "Y";
|
||||
} else if (count < labels.length) {
|
||||
if (!showedShort) System.out.println("Line shorter than labels: " + original);
|
||||
showedShort = true;
|
||||
for (int i = count; i < labels.length; ++i) {
|
||||
parts[i] = "";
|
||||
}
|
||||
} else {
|
||||
throw new ChainException("wrong count: {0}",
|
||||
new Object[] {new Integer(line), new Integer(count)});
|
||||
}
|
||||
}
|
||||
if (count != labels.length) {
|
||||
if (count == labels.length + 1 && parts[count-1].equals("")) {
|
||||
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
|
||||
showedSemi = true;
|
||||
} else if (count == 1) { // fix simple list
|
||||
++count;
|
||||
parts[1] = "Y";
|
||||
} else if (count < labels.length) {
|
||||
if (!showedShort) System.out.println("Line shorter than labels: " + original);
|
||||
showedShort = true;
|
||||
for (int i = count; i < labels.length; ++i) {
|
||||
parts[i] = "";
|
||||
}
|
||||
} else {
|
||||
throw new ChainException("wrong count: {0}",
|
||||
new Object[] {new Integer(line), new Integer(count)});
|
||||
}
|
||||
}
|
||||
|
||||
// store char
|
||||
// first field is always character OR range. May be UTF-32
|
||||
int cpTop;
|
||||
int cpStart;
|
||||
int ddot = parts[0].indexOf(".");
|
||||
if (ddot >= 0) {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
|
||||
cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
|
||||
// System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
|
||||
} else {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
|
||||
cpTop = cpStart;
|
||||
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
}
|
||||
// store char
|
||||
// first field is always character OR range. May be UTF-32
|
||||
int cpTop;
|
||||
int cpStart;
|
||||
int ddot = parts[0].indexOf(".");
|
||||
if (ddot >= 0) {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
|
||||
cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
|
||||
// System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
|
||||
} else {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
|
||||
cpTop = cpStart;
|
||||
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
}
|
||||
|
||||
// properties first
|
||||
if (labels[1].equals("PROP")) {
|
||||
String prop = parts[2].trim();
|
||||
// FIX!!
|
||||
boolean skipLetters = false;
|
||||
if (prop.equals("Alphabetic")) {
|
||||
prop = "Other_Alphabetic";
|
||||
skipLetters = true;
|
||||
}
|
||||
// END FIX!!
|
||||
properties.add(prop);
|
||||
if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted
|
||||
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
if (end == 0) end = cpStart;
|
||||
|
||||
for (int j = cpStart; j <= end; ++j) {
|
||||
if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue;
|
||||
if (skipLetters && getEntry(cpStart).isLetter()) continue;
|
||||
appendCharProperties(j, prop);
|
||||
}
|
||||
}
|
||||
} else { // not range!
|
||||
String val = "";
|
||||
String lastVal;
|
||||
|
||||
// properties first
|
||||
if (labels[1].equals("PROP")) {
|
||||
String prop = parts[2].trim();
|
||||
// FIX!!
|
||||
boolean skipLetters = false;
|
||||
if (prop.equals("Alphabetic")) {
|
||||
prop = "Other_Alphabetic";
|
||||
skipLetters = true;
|
||||
}
|
||||
// END FIX!!
|
||||
properties.add(prop);
|
||||
if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted
|
||||
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
if (end == 0) end = cpStart;
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
String key = labels[i];
|
||||
lastVal = val;
|
||||
if (isHex.get(key) != null) {
|
||||
val = Utility.fromHex(parts[i]);
|
||||
} else {
|
||||
val = parts[i].trim();
|
||||
}
|
||||
if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
|
||||
if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
|
||||
if (val.equals("")) continue; // skip empty values, they mean default
|
||||
|
||||
for (int j = cpStart; j <= end; ++j) {
|
||||
if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue;
|
||||
if (skipLetters && getEntry(cpStart).isLetter()) continue;
|
||||
appendCharProperties(j, prop);
|
||||
}
|
||||
}
|
||||
} else { // not range!
|
||||
String val = "";
|
||||
String lastVal;
|
||||
for (int cps = cpStart; cps <= cpTop; ++cps) {
|
||||
if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
String key = labels[i];
|
||||
lastVal = val;
|
||||
if (isHex.get(key) != null) {
|
||||
val = Utility.fromHex(parts[i]);
|
||||
} else {
|
||||
val = parts[i].trim();
|
||||
}
|
||||
if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
|
||||
if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
|
||||
if (val.equals("")) continue; // skip empty values, they mean default
|
||||
|
||||
for (int cps = cpStart; cps <= cpTop; ++cps) {
|
||||
if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges
|
||||
|
||||
if (key.equals("binary")) {
|
||||
appendCharProperties(cps, val);
|
||||
} else if (key.equals("fc")) {
|
||||
UData data = getEntry(cps);
|
||||
String type = parts[i-1].trim();
|
||||
if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
|
||||
data.fullCaseFolding = val;
|
||||
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("S") || type.equals("C") || type.equals("L")) {
|
||||
data.simpleCaseFolding = val;
|
||||
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("I")) {
|
||||
data.simpleCaseFolding = val;
|
||||
setBinaryProperty(cps, CaseFoldTurkishI);
|
||||
if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
|
||||
+ Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
|
||||
&& labels[4].equals("sc")
|
||||
&& parts[4].trim().length() > 0) {
|
||||
if (i < 4) {
|
||||
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
|
||||
+ Utility.hex(key) + ":" + Utility.hex(val));
|
||||
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
|
||||
}
|
||||
} else {
|
||||
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
|
||||
UData data = getEntryIfExists(cps);
|
||||
if (data == null || data.generalCategory == Cn) continue;
|
||||
}
|
||||
*/
|
||||
addCharData(cps, key, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (key.equals("binary")) {
|
||||
appendCharProperties(cps, val);
|
||||
} else if (key.equals("fc")) {
|
||||
UData data = getEntry(cps);
|
||||
String type = parts[i-1].trim();
|
||||
if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
|
||||
data.fullCaseFolding = val;
|
||||
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("S") || type.equals("C") || type.equals("L")) {
|
||||
data.simpleCaseFolding = val;
|
||||
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("I")) {
|
||||
data.simpleCaseFolding = val;
|
||||
setBinaryProperty(cps, CaseFoldTurkishI);
|
||||
if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
|
||||
+ Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
|
||||
&& labels[4].equals("sc")
|
||||
&& parts[4].trim().length() > 0) {
|
||||
if (i < 4) {
|
||||
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
|
||||
+ Utility.hex(key) + ":" + Utility.hex(val));
|
||||
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
|
||||
}
|
||||
} else {
|
||||
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
|
||||
UData data = getEntryIfExists(cps);
|
||||
if (data == null || data.generalCategory == Cn) continue;
|
||||
}
|
||||
*/
|
||||
addCharData(cps, key, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("*Exception at: " + line + ", " + e.getMessage());
|
||||
//System.err.println(e.getMessage());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line + ", " + e.getMessage());
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
||||
* $Date: 2005/07/19 17:21:00 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2005/11/01 00:10:53 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -290,7 +290,7 @@ public class GenerateConfusables {
|
|||
lowerIsBetter.putAll(remainingOutputSet, MARK_ASCII);
|
||||
lowerIsBetter.setMissing(MARK_NOT_NFC);
|
||||
|
||||
lowerIsBetter.lock();
|
||||
lowerIsBetter.freeze();
|
||||
// add special values:
|
||||
//lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0));
|
||||
|
||||
|
@ -321,11 +321,11 @@ public class GenerateConfusables {
|
|||
PROHIBITED + NOT_IN_XID);
|
||||
removals2.setMissing("future?");
|
||||
|
||||
additions.lock();
|
||||
remap.lock();
|
||||
removals.lock();
|
||||
reviews.lock();
|
||||
removals2.lock();
|
||||
additions.freeze();
|
||||
remap.freeze();
|
||||
removals.freeze();
|
||||
reviews.freeze();
|
||||
removals2.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -431,8 +431,8 @@ public class GenerateConfusables {
|
|||
//reviews.putAll(UNASSIGNED, "");
|
||||
out.print("\uFEFF");
|
||||
out.println("# Review List for IDN");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("");
|
||||
|
||||
UnicodeSet fullSet = reviews.getSet("").complement();
|
||||
|
@ -487,8 +487,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
|
||||
|
||||
out.println("# Recommended Identifier Profiles for IDN");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
|
||||
out.println("");
|
||||
out.println("# Output Characters");
|
||||
|
@ -557,8 +557,8 @@ public class GenerateConfusables {
|
|||
"xidmodifications.txt");
|
||||
|
||||
out.println("# Security Profile for General Identifiers");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("");
|
||||
|
||||
out.println("# Characters restricted");
|
||||
|
@ -614,8 +614,8 @@ public class GenerateConfusables {
|
|||
//someRemovals = removals;
|
||||
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
|
||||
out.println("# Characters restricted in domain names");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("#");
|
||||
out.println("# This file contains a draft list of characters for use in");
|
||||
out.println("# UTR #36: Unicode Security Considerations");
|
||||
|
@ -1149,8 +1149,8 @@ public class GenerateConfusables {
|
|||
public void writeSource(String directory, String filename) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.println("# Source File for IDN Confusables");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("");
|
||||
dataMixedAnycase.writeSource(out);
|
||||
out.close();
|
||||
|
@ -1160,8 +1160,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("");
|
||||
|
||||
if (appendFile) {
|
||||
|
@ -1369,8 +1369,8 @@ public class GenerateConfusables {
|
|||
UnicodeSet representable = new UnicodeSet();
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("");
|
||||
MyEquivalenceClass data = dataMixedAnycase;
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
|
@ -1494,8 +1494,8 @@ public class GenerateConfusables {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Whole-Script Confusables");
|
||||
out.println("# $Revision: 1.7 $");
|
||||
out.println("# $Date: 2005/07/19 17:21:00 $");
|
||||
out.println("# $Revision: 1.8 $");
|
||||
out.println("# $Date: 2005/11/01 00:10:53 $");
|
||||
out.println("# This data is used for determining whether a strings is a");
|
||||
out.println("# whole-script or mixed-script confusable.");
|
||||
out.println("# The mappings here ignore common and inherited script characters,");
|
||||
|
|
|
@ -206,7 +206,7 @@ class GenerateStringPrep implements UCD_Types {
|
|||
return a + "\t" + b;
|
||||
}
|
||||
};
|
||||
UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
|
||||
UnicodeMap sb = ((UnicodeMap)scripts.cloneAsThawed()).composeWith(blocks, myCompose);
|
||||
for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
|
||||
System.out.println(it.next());
|
||||
}
|
||||
|
|
501
tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java
Normal file
501
tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java
Normal file
|
@ -0,0 +1,501 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.utility.Utility;
|
||||
import com.ibm.text.utility.Utility.Encoding;
|
||||
|
||||
public class MakeNamesChart {
|
||||
|
||||
static int lastCodePoint = -1;
|
||||
static boolean lastCodePointIsOld = false;
|
||||
static int lastDecompType = UCD.NONE;
|
||||
|
||||
static final String chartPrefix = "c_";
|
||||
static final String namePrefix = "n_";
|
||||
|
||||
static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
|
||||
static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
|
||||
static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
|
||||
|
||||
static UCD ucd41;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
//ConvertUCD.main(new String[]{"5.0.0"});
|
||||
BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
|
||||
// http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
|
||||
Default.setUCD("5.0.0");
|
||||
ucd41 = UCD.make("4.1.0");
|
||||
ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");
|
||||
skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("gc=cn"));
|
||||
//"[[:gc=cn:]-[:noncharactercodepoint:]]");
|
||||
rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));// "[[:bidiclass=r:][:bidiclass=al:]]");
|
||||
usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));// new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
|
||||
|
||||
List nameList = new ArrayList();
|
||||
ArrayList lines = new ArrayList();
|
||||
UnicodeSet collectedCodePoints = new UnicodeSet();
|
||||
BitSet nameListNew = new BitSet();
|
||||
|
||||
int limit = Integer.MAX_VALUE;
|
||||
for (int count = 0; count < limit; ++count) {
|
||||
if (!blockInfo.next(lines)) break;
|
||||
String firstLine = (String)lines.get(0);
|
||||
if (firstLine.startsWith("@@@")) continue;
|
||||
String[] lineParts = firstLine.split("\t");
|
||||
String fileName = lineParts[1] + ".html";
|
||||
nameList.add(firstLine);
|
||||
System.out.println();
|
||||
System.out.println("file: " + chartPrefix + fileName);
|
||||
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
|
||||
BagFormatter.toHTML.transliterate(getHeading(lineParts[2])) +
|
||||
"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
|
||||
"<base target='names'></head><body>");
|
||||
|
||||
// header
|
||||
out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
|
||||
lineParts[1] +
|
||||
" <a href='help.html'>help</a></td><td class='headerCenter'>" +
|
||||
getHeading(lineParts[2]) +
|
||||
"</td><td class='headerRight'><a href='mainList.html'>index</a> " +
|
||||
lineParts[3] +
|
||||
"</td></tr></table>");
|
||||
|
||||
if ("Unassigned".equals(lineParts[2])) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
// first pass through and collect all the code points
|
||||
collectedCodePoints.clear();
|
||||
for (int i = 1; i < lines.size(); ++i) {
|
||||
String line = (String)lines.get(i);
|
||||
int cp1 = line.charAt(0);
|
||||
if (cp1 != '@' && cp1 != '\t') {
|
||||
int cp = Integer.parseInt(line.split("\t")[0],16);
|
||||
collectedCodePoints.add(cp);
|
||||
}
|
||||
}
|
||||
collectedCodePoints.removeAll(skipChars);
|
||||
if (collectedCodePoints.size() == 0) {
|
||||
out.println("<p align='center'>No Names List</p>");
|
||||
} else {
|
||||
out.println("<div align='center'><table class='chart'><tr>");
|
||||
int counter = 0;
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
|
||||
if ((counter % 16) == 0 && counter != 0) {
|
||||
out.println("</tr><tr>");
|
||||
}
|
||||
String tdclass = "cell";
|
||||
if (counter < 16) tdclass = "cellw";
|
||||
if (it.codepoint == 0x242) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean isNew = isNew(it.codepoint);
|
||||
if (isNew) tdclass += "new";
|
||||
String hexcp = Utility.hex(it.codepoint, 4);
|
||||
String title = "";
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name != null) title = " title='" + BagFormatter.toHTML.transliterate(name.toLowerCase()) + "'";
|
||||
out.println("<td class='" + tdclass + "'"
|
||||
+ title
|
||||
+ ">\u00A0"
|
||||
+ showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#"+ hexcp + "'>" +
|
||||
hexcp + "</a></tt></td>");
|
||||
counter++;
|
||||
}
|
||||
if (counter > 16) {
|
||||
counter &= 0xF;
|
||||
if (counter != 0) for (; counter < 16; ++counter) out.println("<td class='cell'>\u00A0</td>");
|
||||
out.println("</tr></table></div>");
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
|
||||
"<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");
|
||||
|
||||
// now do the characters
|
||||
boolean inTable = false;
|
||||
for (int i = 1; i < lines.size(); ++i) {
|
||||
String line = (String)lines.get(i);
|
||||
try {
|
||||
if (line.startsWith("@")) {
|
||||
finishItem(out);
|
||||
if (inTable) {
|
||||
out.println("</table>");
|
||||
inTable = false;
|
||||
}
|
||||
if (line.startsWith("@+")) {
|
||||
line = line.substring(2).trim();
|
||||
out.println("<p class='comment'>"
|
||||
+ line
|
||||
+ "</p>");
|
||||
} else {
|
||||
line = line.substring(1).trim();
|
||||
out.println("<h2>"
|
||||
+ line
|
||||
+ "</h2>");
|
||||
}
|
||||
} else {
|
||||
if (!inTable) {
|
||||
out.println("<table>");
|
||||
inTable = true;
|
||||
}
|
||||
//String line2 = lineParts[1];
|
||||
if (line.startsWith("\t")) {
|
||||
String body = line.trim();
|
||||
if (false && line.indexOf(body) != 1) {
|
||||
System.out.println("Format error: too much inital whitespace: <" + line + ">");
|
||||
}
|
||||
char firstChar = body.charAt(0);
|
||||
switch (firstChar) {
|
||||
case '*': body = "\u2022 " + body.substring(2); break;
|
||||
case ':': body = checkCanonical(lastCodePoint, body); break;
|
||||
case '#': body = checkCompatibility(lastCodePoint, body); break;
|
||||
case 'x': body = getOther(body); break;
|
||||
case '=': break;
|
||||
default: throw new IllegalArgumentException("Huh? " + body);
|
||||
}
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
|
||||
+ maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
} else {
|
||||
finishItem(out);
|
||||
lineParts = line.split("\t");
|
||||
String x = lineParts[0];
|
||||
lastCodePoint = Integer.parseInt(x,16);
|
||||
boolean lastCodePointIsNew = isNew(lastCodePoint);
|
||||
if (lastCodePointIsNew) nameListNew.set(nameList.size()-1, true);
|
||||
out.println("<tr><td"
|
||||
+ (lastCodePointIsNew ? " class='new'" : "")
|
||||
+ "><code><a name='" + x + "'>" + x + "</a></code></td><td>\u00A0"
|
||||
+ showChar(lastCodePoint) + "\u00A0</td><td"
|
||||
+ (lastCodePointIsNew ? " class='new'" : "") + ">"
|
||||
+ nameStyle(showTextConvertingHex(lineParts[1], false)) + "</td></tr>");
|
||||
lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
|
||||
.initCause(e);
|
||||
}
|
||||
}
|
||||
finishItem(out);
|
||||
out.close();
|
||||
}
|
||||
blockInfo.in.close();
|
||||
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html");
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
|
||||
"<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
|
||||
"<base target='chart'></head><body><table>");
|
||||
for (int i = 0; i < nameList.size(); ++i) {
|
||||
String line = (String) nameList.get(i);
|
||||
String[] lineParts = line.split("\t");
|
||||
String fileName = lineParts[1] + ".html";
|
||||
out.println("<tr><td><code>" + lineParts[1] +
|
||||
"</code></td><td"
|
||||
+ (nameListNew.get(i) ? " class='new'" : "")
|
||||
+ "><a href='" + chartPrefix + fileName + "'>" + getHeading(lineParts[2]) + "</a></td><td><code>" +
|
||||
lineParts[3] +"</code></td></tr>");
|
||||
}
|
||||
out.println("</table></body></html>");
|
||||
out.close();
|
||||
BagFormatter bf = new BagFormatter();
|
||||
//System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in decomps", hasNoName));
|
||||
System.out.println("Name differences: Canonical");
|
||||
showNameDifferences(hasNameCan, hasNoNameCan);
|
||||
System.out.println("Name differences: Compatibility");
|
||||
showNameDifferences(hasNameComp, hasNoNameComp);
|
||||
// System.out.println("Characters with names in decomps: " + hasName.toPattern(true));
|
||||
// System.out.println("Characters without names in decomps: " + hasNoName.toPattern(true));
|
||||
// System.out.println("Characters sometimes with, sometimes without names in decomps: " + both.toPattern(true));
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
private static boolean isNew(int codepoint) {
|
||||
return Default.ucd().isAllocated(codepoint) && !ucd41.isAllocated(codepoint);
|
||||
}
|
||||
|
||||
private static void showNameDifferences(Map hasName, Map hasNoName) {
|
||||
Set both = new TreeSet(hasNoName.keySet());
|
||||
both.retainAll(hasName.keySet());
|
||||
//hasNoName.removeAll(both);
|
||||
//hasName.removeAll(both);
|
||||
for (Iterator it = both.iterator(); it.hasNext();) {
|
||||
String decomp = (String) it.next();
|
||||
System.out.println();
|
||||
System.out.println("decomp: " + Utility.hex(decomp));
|
||||
System.out.println("Has name in: " + Utility.hex((String)hasName.get(decomp)));
|
||||
System.out.println("Has no name in: " + Utility.hex((String)hasNoName.get(decomp)));
|
||||
}
|
||||
System.out.println("Count: " + both.size());
|
||||
}
|
||||
|
||||
/**
 * Confusables lookup used by finishItem. Remains null when the
 * TestIdentifiers data cannot be loaded; the failure is only reported
 * via a stack trace.
 */
static TestIdentifiers ti;
static {
    TestIdentifiers loaded = null;
    try {
        loaded = new TestIdentifiers("L");
    } catch (IOException e) {
        // best effort: report and continue with ti == null
        e.printStackTrace();
    }
    ti = loaded;
}
|
||||
|
||||
private static void finishItem(PrintWriter out) {
|
||||
if (lastCodePoint < 0) return;
|
||||
if (lastDecompType != UCD.NONE) {
|
||||
System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
|
||||
}
|
||||
String str = UTF16.valueOf(lastCodePoint);
|
||||
String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191");
|
||||
showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195");
|
||||
String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193");
|
||||
showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194");
|
||||
|
||||
String dc = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB");
|
||||
//String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB");
|
||||
String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD");
|
||||
|
||||
if (nfkd.equals(str)) {
|
||||
Set s = ti.getConfusables(lastCodePoint, "MA");
|
||||
if (s.size() > 1) {
|
||||
sortedSet.clear();
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
sortedSet.add(Default.nfkd().normalize((String)it.next()));
|
||||
}
|
||||
sortedSet.remove(nfkd); // remove me
|
||||
for (Iterator it = sortedSet.iterator(); it.hasNext();) {
|
||||
String other = (String)it.next();
|
||||
if (nfkd.equals(Default.nfkd().normalize(other))) continue;
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
|
||||
+ showTextConvertingHex(Utility.hex(other, 4, " + "), true)
|
||||
+ " "
|
||||
+ Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase()
|
||||
// maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
}
|
||||
}
|
||||
}
|
||||
lastCodePoint = -1;
|
||||
}
|
||||
|
||||
static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));
|
||||
|
||||
private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
|
||||
if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) {
|
||||
out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>" + symbol + "\u00A0"
|
||||
+ showTextConvertingHex(Utility.hex(transformed, 4, " + "), true)
|
||||
+ (UTF16.countCodePoint(transformed) != 1 ? "" :
|
||||
" " + Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase())
|
||||
// maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=')
|
||||
+ "</td></tr>");
|
||||
}
|
||||
return transformed;
|
||||
}
|
||||
|
||||
static public String getHeading(String name) {
|
||||
int pos = name.lastIndexOf(" (");
|
||||
if (pos < 0) return name;
|
||||
return name.substring(0, pos);
|
||||
}
|
||||
|
||||
private static String maybeNameStyle(String string, boolean b) {
|
||||
if (b && string.equals(string.toUpperCase(Locale.ENGLISH))) return nameStyle(string);
|
||||
return string;
|
||||
}
|
||||
|
||||
|
||||
private static String nameStyle(String string) {
|
||||
// TODO Auto-generated method stub
|
||||
String result = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
|
||||
// if it has any &xxx;, then restore them.
|
||||
int position = 0;
|
||||
while (true) {
|
||||
if (!escapeMatch.reset(result).find(position)) break;
|
||||
int start = escapeMatch.start();
|
||||
position = escapeMatch.end();
|
||||
result = result.substring(0,start)
|
||||
+ result.substring(start, position).toLowerCase()
|
||||
+ result.substring(position);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
|
||||
|
||||
private static String showTextConvertingHex(String body, boolean addCharToHex) {
|
||||
body = BagFormatter.toHTML.transliterate(body);
|
||||
if (addCharToHex) {
|
||||
int position = 0;
|
||||
while (position < body.length()) {
|
||||
if (!findHex.reset(body).find(position)) break;
|
||||
position = findHex.end();
|
||||
int start = findHex.start();
|
||||
int len = position - start;
|
||||
if (len < 4 || len > 6) continue;
|
||||
int cp = Integer.parseInt(findHex.group(),16);
|
||||
if (cp > 0x10FFFF) continue;
|
||||
String insert = "\u00A0" + showChar(cp);
|
||||
String beginning = body.substring(0,start)
|
||||
+ "<code>" + body.substring(start, position) + "</code>"
|
||||
+ insert;
|
||||
body = beginning + body.substring(position);
|
||||
position = beginning.length();
|
||||
}
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
|
||||
static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
|
||||
static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");
|
||||
|
||||
private static String getOther(String body) {
|
||||
// of form: x (hyphenation point - 2027)
|
||||
// => arrow 2027 X hyphenation point
|
||||
int cp;
|
||||
String name = null;
|
||||
if (pointer.reset(body).matches()) {
|
||||
cp = Integer.parseInt(pointer.group(2),16);
|
||||
name = pointer.group(1);
|
||||
String name2 = Default.ucd().getName(cp);
|
||||
if (name2 == null) name2 = "<not a character>";
|
||||
if (!name.equalsIgnoreCase(name2)) {
|
||||
System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
|
||||
System.out.println("\tName is: " + name2);
|
||||
}
|
||||
} else if (pointer2.reset(body).matches()) {
|
||||
cp = Integer.parseInt(pointer2.group(1),16);
|
||||
// name = UCharacter.getName(cp).toLowerCase();
|
||||
// System.out.println("Irregular format: " + body);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Bad format: " + body);
|
||||
}
|
||||
return "\u2192 " + Utility.hex(cp,4) /*+ " " + showChar(cp)*/ + (name != null ? " " + name : "");
|
||||
}
|
||||
|
||||
static String showChar(int cp) {
|
||||
if (usePicture.contains(cp)) {
|
||||
int rep = '\u2588';
|
||||
if (cp <= 0x20) rep = 0x2400 + cp;
|
||||
else if (cp == 0x7F) rep = 0x2421;
|
||||
return "<span class='inv'>" + (char)rep + "</span>";
|
||||
//String hex = Utility.hex(cp);
|
||||
//return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
|
||||
}
|
||||
int type = Default.ucd().getCategory(cp);
|
||||
String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp));
|
||||
if (type == UCD.Me || type == UCD.Mn) {
|
||||
result = "\u25CC" + result;
|
||||
} else if (rtl.contains(cp)) {
|
||||
result = "\u200E" + result + "\u200E";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
|
||||
static final Map hasNoNameCan = new TreeMap();
|
||||
static final Map hasNameCan = new TreeMap();
|
||||
static final Map hasNoNameComp = new TreeMap();
|
||||
static final Map hasNameComp = new TreeMap();
|
||||
|
||||
private static String checkCanonical(int codePoint, String body) {
|
||||
body = body.substring(2);
|
||||
if (lastDecompType != UCD.CANONICAL) {
|
||||
System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
|
||||
}
|
||||
String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String hexed = Utility.hex(lastDecomp, 4, " ");
|
||||
String hexed2 = hexed;
|
||||
if (UTF16.countCodePoint(lastDecomp) == 1) {
|
||||
hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase();
|
||||
}
|
||||
if (hexed.equalsIgnoreCase(body)) {
|
||||
hasNoNameCan.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else if (hexed2.equalsIgnoreCase(body)) {
|
||||
hasNameCan.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else {
|
||||
System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
|
||||
System.out.println("\tShould be: " + hexed);
|
||||
}
|
||||
lastDecompType = UCD.NONE;
|
||||
return "\u2261 " + body;
|
||||
}
|
||||
|
||||
private static String checkCompatibility(int codePoint, String body) {
|
||||
body = body.substring(2);
|
||||
if (lastDecompType <= UCD.CANONICAL) {
|
||||
System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
|
||||
}
|
||||
String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint);
|
||||
String hexed = Utility.hex(lastDecomp, 4, " ");
|
||||
if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
|
||||
String lastDecompID = Default.ucd().getDecompositionTypeID(lastCodePoint);
|
||||
hexed = "<" + lastDecompID + "> " + hexed;
|
||||
}
|
||||
String hexed2 = hexed;
|
||||
if (UTF16.countCodePoint(lastDecomp) == 1) {
|
||||
hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase();
|
||||
}
|
||||
if (hexed.equalsIgnoreCase(body)) {
|
||||
hasNoNameComp.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else if (hexed2.equalsIgnoreCase(body)) {
|
||||
hasNameComp.put(lastDecomp, UTF16.valueOf(codePoint));
|
||||
} else {
|
||||
System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
|
||||
System.out.println("\tShould be: " + hexed);
|
||||
}
|
||||
lastDecompType = UCD.NONE;
|
||||
return "\u2248 " + body;
|
||||
}
|
||||
|
||||
static class BlockInfo {
|
||||
BufferedReader in;
|
||||
String lastLine;
|
||||
BlockInfo (String version, String filename) throws IOException {
|
||||
in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
|
||||
//in = BagFormatter.openUTF8Reader(dir, filename);
|
||||
}
|
||||
boolean next(List inout) throws IOException {
|
||||
inout.clear();
|
||||
if (lastLine != null) {
|
||||
inout.add(lastLine);
|
||||
lastLine = null;
|
||||
}
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("@@\t")) {
|
||||
lastLine = line;
|
||||
break;
|
||||
}
|
||||
inout.add(line);
|
||||
}
|
||||
return inout.size() > 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -136,7 +136,7 @@ public final class Normalizer implements UCD_Types {
|
|||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @param cp the code point to normalize
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(int cp) {
|
||||
|
@ -157,7 +157,7 @@ public final class Normalizer implements UCD_Types {
|
|||
/**
|
||||
* Does a quick check to see if the string is in the current form. Checks canonical order and
|
||||
* isAllowed().
|
||||
* @param source source text
|
||||
* @param newLocaleID source text
|
||||
* @return YES, NO, MAYBE
|
||||
*/
|
||||
/*
|
||||
|
|
|
@ -86,7 +86,7 @@ public class NormalizerSample implements UCD_Types {
|
|||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @param cp the code point to normalize
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(int cp) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
|
||||
* $Date: 2005/10/11 19:39:15 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,6 +21,7 @@ import com.ibm.icu.dev.test.util.UnicodeMap;
|
|||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
@ -30,17 +31,92 @@ import com.ibm.text.utility.*;
|
|||
|
||||
public class QuickTest implements UCD_Types {
|
||||
public static void main(String[] args) throws IOException {
|
||||
getBidiMirrored();
|
||||
if (true) return;
|
||||
getLengths("NFC", Default.nfc());
|
||||
getLengths("NFD", Default.nfd());
|
||||
getLengths("NFKC", Default.nfkc());
|
||||
getLengths("NFKD", Default.nfkd());
|
||||
System.out.println("Done");
|
||||
try {
|
||||
//getBidiMirrored();
|
||||
getCaseFoldingUnstable();
|
||||
if (true) return;
|
||||
getHasAllNormalizations();
|
||||
getLengths("NFC", Default.nfc());
|
||||
getLengths("NFD", Default.nfd());
|
||||
getLengths("NFKC", Default.nfkc());
|
||||
getLengths("NFKD", Default.nfkd());
|
||||
} finally {
|
||||
System.out.println("Done");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static void getCaseFoldingUnstable() {
|
||||
for (int i = 3; i < com.ibm.text.utility.Utility.searchPath.length - 1; ++i) {
|
||||
String newName = com.ibm.text.utility.Utility.searchPath[i];
|
||||
String oldName = com.ibm.text.utility.Utility.searchPath[i+1];
|
||||
showMemoryUsage();
|
||||
UCD ucdNew = UCD.make(newName);
|
||||
showMemoryUsage();
|
||||
UCD ucdOld = UCD.make(oldName);
|
||||
showMemoryUsage();
|
||||
UnicodeMap differences = new UnicodeMap();
|
||||
UnicodeSet differenceSet = new UnicodeSet();
|
||||
for (int j = 0; j < 0x10FFFF; ++j) {
|
||||
if (!ucdOld.isAssigned(j)) continue;
|
||||
String oldString = ucdOld.getCase(j, UCD.FULL, UCD.FOLD);
|
||||
String newString = ucdNew.getCase(j, UCD.FULL, UCD.FOLD);
|
||||
if (!oldString.equals(newString)) {
|
||||
differenceSet.add(j);
|
||||
differences.put(j, new String[]{oldString, newString});
|
||||
System.out.println(".");
|
||||
}
|
||||
}
|
||||
if (differenceSet.size() != 0) {
|
||||
System.out.println("Differences in " + com.ibm.text.utility.Utility.searchPath[i]);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(differenceSet); it.next();) {
|
||||
System.out.println(ucdNew.getCodeAndName(it.codepoint));
|
||||
String[] strings = (String[]) differences.getValue(it.codepoint);
|
||||
System.out.println("\t" + oldName + ": " + ucdNew.getCodeAndName(strings[0]));
|
||||
System.out.println("\t" + newName + ": " + ucdNew.getCodeAndName(strings[1]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static public void showMemoryUsage() {
|
||||
System.gc(); System.gc(); System.gc(); System.gc();
|
||||
System.gc(); System.gc(); System.gc(); System.gc();
|
||||
System.gc(); System.gc(); System.gc(); System.gc();
|
||||
System.gc(); System.gc(); System.gc(); System.gc();
|
||||
System.out.println("total:\t" + Runtime.getRuntime().totalMemory() + ";\tfree:\t" +
|
||||
Runtime.getRuntime().freeMemory());
|
||||
}
|
||||
|
||||
private static void getHasAllNormalizations() {
|
||||
UnicodeSet items = new UnicodeSet();
|
||||
Set s = new LinkedHashSet();
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd().isAssigned(i)) continue;
|
||||
if (Default.ucd().getDecompositionType(i) == UCD.NONE) continue;
|
||||
String source = UTF16.valueOf(i);
|
||||
String nfc = Default.nfc().normalize(source);
|
||||
String nfd = Default.nfd().normalize(source);
|
||||
String nfkd = Default.nfkd().normalize(source);
|
||||
String nfkc = Default.nfkc().normalize(source);
|
||||
s.clear();
|
||||
s.add(source);
|
||||
s.add(nfc);
|
||||
s.add(nfd);
|
||||
s.add(nfkd);
|
||||
s.add(nfkc);
|
||||
if (s.size() > 3) {
|
||||
System.out.println(Utility.hex(source) + "\t" + Utility.escape(source)
|
||||
+ "\t" + Default.ucd().getName(source)
|
||||
+ "\tnfd\t" + Utility.hex(nfd) + "\t" + Utility.escape(nfd)
|
||||
+ "\tnfc\t" + Utility.hex(nfc) + "\t" + Utility.escape(nfc)
|
||||
+ "\tnfkd\t" + Utility.hex(nfkd) + "\t" + Utility.escape(nfkd)
|
||||
+ "\tnfkc\t" + Utility.hex(nfkc) + "\t" + Utility.escape(nfkc));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static void getBidiMirrored() {
|
||||
ToolUnicodePropertySource foo = ToolUnicodePropertySource.make("");
|
||||
UnicodeMap status = new UnicodeMap();
|
||||
|
@ -92,9 +168,10 @@ public class QuickTest implements UCD_Types {
|
|||
UnicodeSet set = status.getSet(value);
|
||||
for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) {
|
||||
System.out.println(Utility.hex(umi.codepoint)
|
||||
+ ";\t" + value
|
||||
+ ";\t" + (x.contains(umi.codepoint) ? "O" : "")
|
||||
+ ";\t" + Default.ucd().getName(umi.codepoint));
|
||||
+ (value.startsWith("*") ? ";\tBidi_Mirrored" : "")
|
||||
+ "\t#\t" + value
|
||||
//+ ";\t" + (x.contains(umi.codepoint) ? "O" : "")
|
||||
+ "\t" + Default.ucd().getName(umi.codepoint));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -288,6 +365,6 @@ public class QuickTest implements UCD_Types {
|
|||
System.out.println("\tCount:" + set1.size());
|
||||
System.out.println("\tSet:" + set1.toPattern(true));
|
||||
System.out.println("\tDetails:");
|
||||
Utility.showSetNames("", set1, false, Default.ucd());
|
||||
//Utility.showSetNames("", set1, false, Default.ucd());
|
||||
}
|
||||
}
|
|
@ -4,10 +4,15 @@ import java.io.BufferedReader;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.XEquivalenceClass;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
@ -34,6 +39,14 @@ public class TestIdentifiers {
|
|||
System.out.print(folded);
|
||||
ti.testItem(folded);
|
||||
}
|
||||
for (int j = 0; j < tests[i].length(); ++j) {
|
||||
int cp = tests[i].charAt(j);
|
||||
Set s = ti.getConfusables(cp, "MA");
|
||||
System.out.println(Default.ucd().getCodeAndName(cp));
|
||||
for (Iterator it = s.iterator(); it.hasNext();) {
|
||||
System.out.println("\t= " + Default.ucd().getCodeAndName((String)it.next()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -141,6 +154,49 @@ public class TestIdentifiers {
|
|||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
Map type_equivalences;
|
||||
|
||||
void loadConfusables() throws IOException {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir,
|
||||
"confusables.txt");
|
||||
String line = null;
|
||||
type_equivalences = new HashMap();
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null)
|
||||
break;
|
||||
if (line.length() == 0)
|
||||
continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is source code point
|
||||
String s = Utility.fromHex(pieces[0].trim());
|
||||
// part 1 is script1
|
||||
String t = Utility.fromHex(pieces[1].trim());
|
||||
|
||||
String type = pieces[2].trim();
|
||||
XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type);
|
||||
if (ec == null) type_equivalences.put(type, ec = new XEquivalenceClass(""));
|
||||
ec.add(s, t);
|
||||
//System.out.println(type + ": " + Default.ucd().getCodeAndName(s) + " => " + Default.ucd().getCodeAndName(t));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line "
|
||||
+ line).initCause(e);
|
||||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
public Set getConfusables(int cp, String type) {
|
||||
try {
|
||||
if (type_equivalences == null) loadConfusables();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type);
|
||||
return ec.getEquivalences(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
void loadWholeScriptConfusables(String filterType) throws IOException {
|
||||
UnicodeSet[][] script_script_set = new UnicodeSet[UScript.CODE_LIMIT][UScript.CODE_LIMIT];
|
||||
|
|
|
@ -73,7 +73,7 @@ public class TestUnicodeInvariants {
|
|||
int variableCount = 0;
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", "UnicodeInvariants.txt");
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
BagFormatter bf2 = new BagFormatter();
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2005/05/02 15:39:53 $
|
||||
* $Revision: 1.39 $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.40 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -43,7 +43,7 @@ public final class UCD implements UCD_Types {
|
|||
/**
|
||||
* Used for the default version.
|
||||
*/
|
||||
public static final String latestVersion = "4.1.0";
|
||||
public static final String latestVersion = "5.1.0";
|
||||
|
||||
/**
|
||||
* Create singleton instance for default (latest) version
|
||||
|
@ -158,12 +158,16 @@ public final class UCD implements UCD_Types {
|
|||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getName(String s, byte style) {
|
||||
return getName(s, style, ", ");
|
||||
}
|
||||
|
||||
public String getName(String s, byte style, String separator) {
|
||||
if (s.length() == 1) return getName(s.charAt(0), style); // optimize BMP
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
if (i > 0) result.append(separator);
|
||||
result.append(getName(cp, style));
|
||||
}
|
||||
return result.toString();
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2005/03/10 02:37:20 $
|
||||
* $Revision: 1.31 $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.32 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
|
|||
|
||||
public interface UCD_Types {
|
||||
|
||||
static final byte BINARY_FORMAT = 16; // bumped if binary format of UCD changes. Forces rebuild
|
||||
static final byte BINARY_FORMAT = 17; // bumped if binary format of UCD changes. Forces rebuild
|
||||
|
||||
public static final String BASE_DIR = "C:\\DATA\\";
|
||||
public static final String UCD_DIR = BASE_DIR + "UCD\\";
|
||||
|
|
|
@ -111,6 +111,21 @@ $XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax]
|
|||
$Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax]
|
||||
$Pattern_Syntax ! [$XID_Continue $Pattern_Whitespace]
|
||||
|
||||
# Test SA characters
|
||||
|
||||
# They are limited to certain scripts:
|
||||
Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer]
|
||||
$SAScripts ⊇ $LineBreak:SA
|
||||
|
||||
# And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf
|
||||
[$SAScripts & [$Alphabetic $gc:cf]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]]
|
||||
|
||||
# Try removing M* from alphabetic, and matching to SA
|
||||
[$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA
|
||||
|
||||
# Try adding M* to alphabetic, and matching to SA
|
||||
[$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA
|
||||
|
||||
# testing
|
||||
# [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
|
||||
Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/ChainException.java,v $
|
||||
* $Date: 2001/12/06 00:05:52 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2005/11/01 00:10:53 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -20,7 +20,7 @@ import java.io.*;
|
|||
public class ChainException extends RuntimeException {
|
||||
Object[] keyData;
|
||||
String messageFormat;
|
||||
Exception chain;
|
||||
//Exception chain;
|
||||
|
||||
public ChainException (String messageFormat, Object[] objects) {
|
||||
this.messageFormat = messageFormat;
|
||||
|
@ -30,20 +30,20 @@ public class ChainException extends RuntimeException {
|
|||
public ChainException (String messageFormat, Object[] objects, Exception chainedException) {
|
||||
this.messageFormat = messageFormat;
|
||||
keyData = objects == null ? null : (Object[]) objects.clone();
|
||||
chain = chainedException;
|
||||
initCause(chainedException);
|
||||
}
|
||||
|
||||
public String getMessage() {
|
||||
String chainMsg = "";
|
||||
if (chain != null) {
|
||||
chainMsg = "; " + chain.getClass().getName()
|
||||
+ ", " + chain.getMessage();
|
||||
StringWriter w = new StringWriter();
|
||||
PrintWriter p = new PrintWriter(w);
|
||||
chain.printStackTrace(p);
|
||||
chainMsg += ", " + w.getBuffer();
|
||||
p.close();
|
||||
}
|
||||
// if (chain != null) {
|
||||
// chainMsg = "; " + chain.getClass().getName()
|
||||
// + ", " + chain.getMessage();
|
||||
// StringWriter w = new StringWriter();
|
||||
// PrintWriter p = new PrintWriter(w);
|
||||
// chain.printStackTrace(p);
|
||||
// chainMsg += ", " + w.getBuffer();
|
||||
// p.close();
|
||||
// }
|
||||
String main = "";
|
||||
if (keyData != null) main = MessageFormat.format(messageFormat, keyData);
|
||||
return main + chainMsg;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2005/06/21 21:28:31 $
|
||||
* $Revision: 1.50 $
|
||||
* $Date: 2005/11/01 00:10:53 $
|
||||
* $Revision: 1.51 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -700,8 +700,9 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
|||
return result + "}";
|
||||
}
|
||||
|
||||
private static final String[] searchPath = {
|
||||
public static final String[] searchPath = {
|
||||
"EXTRAS",
|
||||
"5.0.0",
|
||||
"4.1.0",
|
||||
"4.0.1",
|
||||
"4.0.0",
|
||||
|
|
Loading…
Add table
Reference in a new issue