fixes for collation charts

X-SVN-Rev: 6436
This commit is contained in:
Mark Davis 2001-10-25 20:35:42 +00:00
parent 815bc3f18a
commit ab4045e909
4 changed files with 187 additions and 33 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
* $Date: 2001/10/25 20:35:42 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -30,6 +30,33 @@ public class GenOverlap implements UCD_Types {
static Normalizer nfd;
static Normalizer nfkd;
/**
 * Scans every code point from 0 through 0x10FFFF and, for each represented
 * character whose decomposition type is at least UCD.COMPATIBILITY, compares
 * two collation element lists:
 * the list built from its NFKD decomposition via the four-argument getCEList
 * (decomp flag true), and the list built from the character itself via the
 * two-argument getCEList (decomp flag false).
 * Each mismatch is printed to System.out as the character's code and name
 * followed by both lists.
 *
 * Side effects: reinitializes the static collator, ucd, nfd and nfkd fields.
 *
 * @param collatorIn the collator to validate; stored in the static collator field
 * @throws Exception declared by the original signature; propagated from the helpers
 */
public static void validateUCA(UCA collatorIn) throws Exception {
    collator = collatorIn;
    ucd = UCD.make();
    nfd = new Normalizer(Normalizer.NFD);
    nfkd = new Normalizer(Normalizer.NFKD);

    for (int codePoint = 0x0; codePoint <= 0x10FFFF; ++codePoint) {
        Utility.dot(codePoint);
        if (!ucd.isRepresented(codePoint)) {
            continue;
        }

        byte decompType = ucd.getDecompositionType(codePoint);
        if (decompType < UCD.COMPATIBILITY) {
            continue;
        }

        // Keep this call order: the decomposed list is built first, then the
        // list for the undecomposed character.
        String compatForm = nfkd.normalize(codePoint);
        CEList fromDecomposition = getCEList(codePoint, compatForm, true, decompType);
        CEList fromCharacter = getCEList(UTF16.valueOf(codePoint), false);
        if (fromCharacter.equals(fromDecomposition)) {
            continue;
        }

        // Report the disagreement.
        Utility.fixDot();
        System.out.println();
        System.out.println(ucd.getCodeAndName(codePoint));
        System.out.println(fromCharacter);
        System.out.println(fromDecomposition);
    }
}
public static void test(UCA collatorIn) throws Exception {
collator = collatorIn;
@ -68,7 +95,7 @@ public class GenOverlap implements UCD_Types {
byte decompType = ucd.getDecompositionType(cp);
if (decompType >= UCD.COMPATIBILITY) {
String decomp = nfkd.normalize(cp);
CEList celist = getCEList(cp, decomp, decompType);
CEList celist = getCEList(cp, decomp, true, decompType);
addString(decomp, celist);
System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
}
@ -182,16 +209,22 @@ public class GenOverlap implements UCD_Types {
}
// Convenience overload: returns the CEList for s by delegating to
// getCEList(s, true).
static private CEList getCEList(String s) {
// NOTE(review): this line looks like the pre-change body kept by the diff
// rendering; its result is not used by the return below -- confirm.
int len = collator.getCEs(s, true, ces);
return getCEList(s, true);
}
/**
 * Asks the static collator to fill the shared ces buffer with the collation
 * elements of s, then wraps the filled prefix in a CEList.
 *
 * @param s      the string to get collation elements for
 * @param decomp passed through unchanged as the second argument of collator.getCEs
 * @return a CEList over ces[0 .. count), where count is the value returned by getCEs
 */
static private CEList getCEList(String s, boolean decomp) {
    final int count = collator.getCEs(s, decomp, ces);
    CEList result = new CEList(ces, 0, count);
    return result;
}
static private CEList getCEList(int originalChar, String s, byte type) {
int len = collator.getCEs(s, true, ces);
for (int i = 0; i < len; ++i) {
ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]),
UCA.getSecondary(ces[i]),
CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
/**
 * Fills the shared ces buffer with the collation elements of s. When decomp
 * is true, each element is rebuilt with its primary and secondary weights
 * unchanged and its tertiary weight replaced by
 * CEList.remap(originalChar, type, tertiary).
 *
 * @param originalChar the code point passed to CEList.remap
 * @param s            the string whose collation elements are fetched
 * @param decomp       passed to collator.getCEs; also enables the tertiary remapping
 * @param type         the decomposition type passed to CEList.remap
 * @return a CEList over the first count entries of ces, where count is the value
 *         returned by getCEs
 */
static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) {
    final int count = collator.getCEs(s, decomp, ces);
    if (decomp) {
        int k = 0;
        while (k < count) {
            int primary = UCA.getPrimary(ces[k]);
            int secondary = UCA.getSecondary(ces[k]);
            int tertiary = CEList.remap(originalChar, type, UCA.getTertiary(ces[k]));
            ces[k] = UCA.makeKey(primary, secondary, tertiary);
            ++k;
        }
    }
    return new CEList(ces, 0, count);
}
@ -290,7 +323,7 @@ public class GenOverlap implements UCD_Types {
}
// Single-argument entry point: forwards collatorIn to the two-argument
// generateRevision overload with a fixed boolean flag.
// NOTE(review): the flag is presumably the doMax option used elsewhere in this
// class -- confirm against the two-argument overload.
public static void generateRevision (UCA collatorIn) throws Exception {
// NOTE(review): the un-commented call with false looks like the pre-change line
// kept by the diff rendering; the commented-out copy and the call with true
// look like the post-change state -- confirm.
generateRevision(collatorIn, false);
//generateRevision(collatorIn, false);
generateRevision(collatorIn, true);
}
@ -336,7 +369,7 @@ public class GenOverlap implements UCD_Types {
int cp;
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(str, i);
if (0xFF67 <= cp && cp <= 0xFF6F) {
if (0xFF3F == cp) {
System.out.println("debug");
}
boolean mashLast = false;
@ -351,7 +384,7 @@ public class GenOverlap implements UCD_Types {
int s = UCA.getSecondary(ces[j]);
boolean needsFix = (s != 0x20 && p != 0);
if (needsFix) ++len;
int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
if (needsFix) {
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
@ -413,7 +446,7 @@ public class GenOverlap implements UCD_Types {
newKeys.removeAll(joint);
oldKeys.removeAll(joint);
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"));
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), false);
Iterator it = list.iterator();
int last = -1;
while (it.hasNext()) {
@ -657,4 +690,51 @@ public class GenOverlap implements UCD_Types {
+ "</td><td align='right'>" + nf.format(sd)
+ "</td></tr>");
}
/**
 * Writes "ListCyrillic.txt", listing the NFD forms of represented BMP
 * characters whose script is CYRILLIC_SCRIPT.
 *
 * For each such character the NFD form is taken and every char whose general
 * category is Mn is removed. The stripped string is collected into one set.
 * When stripping changed the string, the unstripped NFD form is collected
 * into a second set. Both sets are TreeSets ordered by collatorIn, so strings
 * that compare equal under the collator are kept only once. The first set is
 * printed with the prefix "# " and the second with "### ".
 *
 * Fixes relative to the previous version:
 * 1. All Mn chars are removed. The old in-place substring loop advanced the
 *    index after a removal and so skipped the char following each removed mark.
 * 2. The result of Utility.replace is used. Strings are immutable, so
 *    discarding it left "CYRILLIC " in every printed name.
 * 3. The log is closed even if an exception is thrown.
 *
 * Side effects: reinitializes the static ucd and nfd fields.
 *
 * @param collatorIn collator used as the ordering Comparator for both sets
 * @throws IOException if the output file cannot be opened
 */
public static void listCyrillic(UCA collatorIn) throws IOException {
    PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", false);
    try {
        Set set = new TreeSet(collatorIn);
        Set set2 = new TreeSet(collatorIn);

        ucd = UCD.make();
        nfd = new Normalizer(Normalizer.NFD);

        for (char i = 0; i < 0xFFFF; ++i) {
            Utility.dot(i);
            if (!ucd.isRepresented(i)) continue;
            if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue;

            String oldDecomp = nfd.normalize(String.valueOf(i));

            // Copy every char that is not a nonspacing mark, so that runs of
            // adjacent marks are removed completely.
            StringBuffer stripped = new StringBuffer(oldDecomp.length());
            for (int j = 0; j < oldDecomp.length(); ++j) {
                char c = oldDecomp.charAt(j);
                if (ucd.getCategory(c) != Mn) {
                    stripped.append(c);
                }
            }
            String decomp = stripped.toString();

            if (decomp.length() == 0) continue;
            set.add(decomp);
            if (!decomp.equals(oldDecomp)) set2.add(oldDecomp);
        }

        printCyrillicEntries(log, set, "# ");
        printCyrillicEntries(log, set2, "### ");
    } finally {
        log.close();
    }
}

/**
 * Prints one line per string in entries: the prefix, the string, the fixed
 * text " <> XXX ; # ", and the name of the string's first char with
 * "CYRILLIC " removed.
 */
private static void printCyrillicEntries(PrintWriter log, Set entries, String prefix) {
    Iterator it = entries.iterator();
    while (it.hasNext()) {
        String s = (String) it.next();
        String name = ucd.getName(s.charAt(0));
        name = Utility.replace(name, "CYRILLIC ", "");
        log.println(prefix + s + " <> XXX ; # " + name);
    }
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -61,9 +61,13 @@ This is because of shared
characters between scripts with different directions, like French with Arabic or Greek.
*/
final public class UCA {
final public class UCA implements Comparator {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
/**
 * Comparator implementation: casts both arguments to String, computes the
 * sort key of each with getSortKey, and returns the result of comparing the
 * two keys with compareTo.
 *
 * @param a first operand; must be a String
 * @param b second operand; must be a String
 * @return the compareTo result of a's sort key against b's sort key
 * @throws ClassCastException if either argument is not a String
 */
public int compare(Object a, Object b) {
    String leftKey = getSortKey((String) a);
    String rightKey = getSortKey((String) b);
    return leftKey.compareTo(rightKey);
}
/**
* Version of the UCA tables to use

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
* $Date: 2001/09/19 23:31:50 $
* $Revision: 1.1 $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -30,6 +30,7 @@ public class WriteCharts implements UCD_Types {
ucd = UCD.make();
Normalizer nfd = new Normalizer(Normalizer.NFD);
Normalizer nfc = new Normalizer(Normalizer.NFC);
UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
@ -53,18 +54,23 @@ public class WriteCharts implements UCD_Types {
int lastPrimary = -1;
String lastSortKey = null;
String lastSortKey = "\u0000";
int high = uca.getSortKey("a").charAt(0);
int variable = UCA.getPrimary(uca.getVariableHigh());
int columnCount = 0;
Utility.copyTextFile("index.html", true, "CollationCharts\\index.html");
Utility.copyTextFile("charts.css", false, "CollationCharts\\charts.css");
Utility.copyTextFile("help.html", true, "CollationCharts\\help.html");
indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
indexFile.println("<title>UCA Default Collation Table</title>");
indexFile.println("<base target='main'>");
indexFile.println("<style><!-- p { font-size: 90% } --></style>");
indexFile.println("</head><body><h2 align='center'>UCA Default Collation Table</h2>");
indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
@ -102,19 +108,31 @@ public class WriteCharts implements UCD_Types {
oldScript = script;
}
int strength = 6;
if (lastSortKey != null && sortKey.charAt(0) == lastSortKey.charAt(0)) {
strength = uca.strengthDifference(sortKey, lastSortKey);
if (strength < 0) strength = -strength;
}
boolean firstPrimaryEquals = sortKey.charAt(0) == lastSortKey.charAt(0);
int strength = uca.strengthDifference(sortKey, lastSortKey);
if (strength < 0) strength = -strength;
lastSortKey = sortKey;
// find out if this is an expansion: more than one primary weight
int primaryCount = 0;
for (int i = 0; i < sortKey.length(); ++i) {
char w = sortKey.charAt(i);
if (w == 0) break;
++ primaryCount;
}
String breaker = "";
if (columnCount > 10 || strength > 5) {
if (strength <= 5) breaker = "</tr><tr><td></td>";
else breaker = "</tr><tr>";
if (columnCount > 10 || !firstPrimaryEquals) {
if (!firstPrimaryEquals) breaker = "</tr><tr>";
else breaker = "</tr><tr><td></td>"; // indent 1 cell
columnCount = 0;
}
output.println(breaker + CLASSNAME[strength] + s
String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength];
output.println(breaker + classname + nfc.normalize(s)
+ "<br><tt>" + Utility.hex(s)
//+ "<br>" + script
//+ "<br>" + UCA.toString(sortKey)
@ -133,8 +151,15 @@ public class WriteCharts implements UCD_Types {
"<td class='q'>",
"<td class='t'>",
"<td class='s'>",
"<td class='p'>",
"<td class='f'>"};
"<td class='p'>"};
// Opening <td> tags for cells whose sort key has more than one leading
// non-zero weight (primaryCount > 1), indexed by strength the same way as
// CLASSNAME. Each CSS class is an "e"-prefixed name; indices 0 through 2 all
// map to 'eq'.
// NOTE(review): presumably the "e" prefix stands for "expansion" -- confirm
// against charts.css.
static final String[] XCLASSNAME = {
"<td class='eq'>",
"<td class='eq'>",
"<td class='eq'>",
"<td class='et'>",
"<td class='es'>",
"<td class='ep'>"};
static PrintWriter indexFile;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2001/09/19 23:32:21 $
* $Revision: 1.4 $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -63,9 +63,13 @@ public class WriteCollationData implements UCD_Types {
String arg = args[i];
if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(collator);
else if (arg.equalsIgnoreCase("writeNonspacingDifference")) writeNonspacingDifference();
else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator);
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(collator);
else if (arg.equalsIgnoreCase("WriteRules")) writeRules(WITHOUT_NAMES);
else if (arg.equalsIgnoreCase("WriteRulesXML")) writeRules(IN_XML);
@ -748,6 +752,47 @@ public class WriteCollationData implements UCD_Types {
return len;
}
/**
 * Writes GEN_DIR + "UCA_Nonspacing.txt" (UTF8, starting with U+FEFF), listing
 * every represented code point for which these two properties disagree:
 * its general category is Mn or Me, and all of its collation elements have a
 * primary weight of zero. A code point with an empty CE list counts as
 * primary ignorable.
 *
 * Each entry is the category ID, the CE list, and the code and name,
 * separated by tabs. Entries are collected in a TreeSet (natural String
 * order) and handed to Utility.print with "\r\n".
 *
 * Changes relative to the previous version:
 * 1. The scan includes 0x10FFFF. The old bound "i < 0x10FFFF" stopped one
 *    code point early, unlike validateUCA.
 * 2. The writer is closed even if an exception is thrown.
 * 3. The static getPrimary is called as UCA.getPrimary, as elsewhere in
 *    these tools.
 *
 * @throws IOException if the output file cannot be created
 */
static void writeNonspacingDifference() throws IOException {
    PrintWriter diLog = new PrintWriter(
        new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream(GEN_DIR + "UCA_Nonspacing.txt"),
                "UTF8"),
            32*1024));
    try {
        diLog.write('\uFEFF');

        Normalizer nfd = new Normalizer(Normalizer.NFD);

        Set sorted = new TreeSet();

        for (int i = 0; i <= 0x10FFFF; ++i) {
            Utility.dot(i);
            if (!ucd.isRepresented(i)) continue;

            byte cat = ucd.getCategory(i);
            boolean isNonSpacing = cat == Mn || cat == Me;

            CEList celist = collator.getCEList(UTF32.valueOf32(i), true);

            // Primary ignorable means that no collation element has a
            // non-zero primary weight.
            boolean isPrimaryIgnorable = true;
            for (int j = 0; j < celist.length(); ++j) {
                int ce = celist.at(j);
                int primary = UCA.getPrimary(ce);
                if (primary != 0) {
                    isPrimaryIgnorable = false;
                    break;
                }
            }

            if (isNonSpacing != isPrimaryIgnorable) {
                sorted.add(ucd.getCategoryID(i)
                    + "\t" + celist
                    + "\t" + ucd.getCodeAndName(i));
            }
        }

        Utility.print(diLog, sorted, "\r\n");
    } finally {
        diLog.close();
    }
}
static void writeContractions() throws IOException {
PrintWriter diLog = new PrintWriter(
new BufferedWriter(