mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
fixes for collation charts
X-SVN-Rev: 6436
This commit is contained in:
parent
815bc3f18a
commit
ab4045e909
4 changed files with 187 additions and 33 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
|
||||
* $Date: 2001/09/19 23:32:21 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/10/25 20:35:42 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -30,6 +30,33 @@ public class GenOverlap implements UCD_Types {
|
|||
static Normalizer nfd;
|
||||
static Normalizer nfkd;
|
||||
|
||||
public static void validateUCA(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
for (int cp = 0x0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
byte decompType = ucd.getDecompositionType(cp);
|
||||
if (decompType >= UCD.COMPATIBILITY) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
CEList celistDecomp = getCEList(cp, decomp, true, decompType);
|
||||
CEList celistNormal = getCEList(UTF16.valueOf(cp), false);
|
||||
if (!celistNormal.equals(celistDecomp)) {
|
||||
Utility.fixDot();
|
||||
System.out.println();
|
||||
System.out.println(ucd.getCodeAndName(cp));
|
||||
System.out.println(celistNormal);
|
||||
System.out.println(celistDecomp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void test(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
|
@ -68,7 +95,7 @@ public class GenOverlap implements UCD_Types {
|
|||
byte decompType = ucd.getDecompositionType(cp);
|
||||
if (decompType >= UCD.COMPATIBILITY) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
CEList celist = getCEList(cp, decomp, decompType);
|
||||
CEList celist = getCEList(cp, decomp, true, decompType);
|
||||
addString(decomp, celist);
|
||||
System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
|
||||
}
|
||||
|
@ -182,16 +209,22 @@ public class GenOverlap implements UCD_Types {
|
|||
}
|
||||
|
||||
static private CEList getCEList(String s) {
|
||||
int len = collator.getCEs(s, true, ces);
|
||||
return getCEList(s, true);
|
||||
}
|
||||
|
||||
static private CEList getCEList(String s, boolean decomp) {
|
||||
int len = collator.getCEs(s, decomp, ces);
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
||||
static private CEList getCEList(int originalChar, String s, byte type) {
|
||||
int len = collator.getCEs(s, true, ces);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]),
|
||||
UCA.getSecondary(ces[i]),
|
||||
CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
|
||||
static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) {
|
||||
int len = collator.getCEs(s, decomp, ces);
|
||||
if (decomp) {
|
||||
for (int i = 0; i < len; ++i) {
|
||||
ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]),
|
||||
UCA.getSecondary(ces[i]),
|
||||
CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
|
||||
}
|
||||
}
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
@ -290,7 +323,7 @@ public class GenOverlap implements UCD_Types {
|
|||
}
|
||||
|
||||
public static void generateRevision (UCA collatorIn) throws Exception {
|
||||
generateRevision(collatorIn, false);
|
||||
//generateRevision(collatorIn, false);
|
||||
generateRevision(collatorIn, true);
|
||||
}
|
||||
|
||||
|
@ -336,7 +369,7 @@ public class GenOverlap implements UCD_Types {
|
|||
int cp;
|
||||
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(str, i);
|
||||
if (0xFF67 <= cp && cp <= 0xFF6F) {
|
||||
if (0xFF3F == cp) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean mashLast = false;
|
||||
|
@ -351,7 +384,7 @@ public class GenOverlap implements UCD_Types {
|
|||
int s = UCA.getSecondary(ces[j]);
|
||||
boolean needsFix = (s != 0x20 && p != 0);
|
||||
if (needsFix) ++len;
|
||||
int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
|
||||
int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
|
||||
if (needsFix) {
|
||||
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
|
||||
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
|
||||
|
@ -413,7 +446,7 @@ public class GenOverlap implements UCD_Types {
|
|||
newKeys.removeAll(joint);
|
||||
oldKeys.removeAll(joint);
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"));
|
||||
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), false);
|
||||
Iterator it = list.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
|
@ -657,4 +690,51 @@ public class GenOverlap implements UCD_Types {
|
|||
+ "</td><td align='right'>" + nf.format(sd)
|
||||
+ "</td></tr>");
|
||||
}
|
||||
|
||||
public static void listCyrillic(UCA collatorIn) throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", false);
|
||||
Set set = new TreeSet(collatorIn);
|
||||
Set set2 = new TreeSet(collatorIn);
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
|
||||
for (char i = 0; i < 0xFFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue;
|
||||
|
||||
String decomp = nfd.normalize(String.valueOf(i));
|
||||
String oldDecomp = decomp;
|
||||
for (int j = 0; j < decomp.length(); ++j) {
|
||||
if (ucd.getCategory(decomp.charAt(j)) == Mn) {
|
||||
decomp = decomp.substring(0,j) + decomp.substring(j+1);
|
||||
}
|
||||
}
|
||||
if (decomp.length() == 0) continue;
|
||||
|
||||
set.add(decomp);
|
||||
if (!decomp.equals(oldDecomp)) set2.add(oldDecomp);
|
||||
}
|
||||
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String name = ucd.getName(s.charAt(0));
|
||||
Utility.replace(name, "CYRILLIC ", "");
|
||||
log.println("# " + s + " <> XXX ; # " + name);
|
||||
}
|
||||
|
||||
it = set2.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String name = ucd.getName(s.charAt(0));
|
||||
Utility.replace(name, "CYRILLIC ", "");
|
||||
log.println("### " + s + " <> XXX ; # " + name);
|
||||
}
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2001/09/19 23:32:21 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/10/25 20:35:41 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -61,9 +61,13 @@ This is because of shared
|
|||
characters between scripts with different directions, like French with Arabic or Greek.
|
||||
*/
|
||||
|
||||
final public class UCA {
|
||||
final public class UCA implements Comparator {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
public int compare(Object a, Object b) {
|
||||
return getSortKey((String) a).compareTo(getSortKey((String) b));
|
||||
}
|
||||
|
||||
/**
|
||||
* Version of the UCA tables to use
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
|
||||
* $Date: 2001/09/19 23:31:50 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2001/10/25 20:35:41 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -30,6 +30,7 @@ public class WriteCharts implements UCD_Types {
|
|||
|
||||
ucd = UCD.make();
|
||||
Normalizer nfd = new Normalizer(Normalizer.NFD);
|
||||
Normalizer nfc = new Normalizer(Normalizer.NFC);
|
||||
|
||||
UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
|
||||
|
||||
|
@ -53,18 +54,23 @@ public class WriteCharts implements UCD_Types {
|
|||
|
||||
int lastPrimary = -1;
|
||||
|
||||
String lastSortKey = null;
|
||||
String lastSortKey = "\u0000";
|
||||
|
||||
int high = uca.getSortKey("a").charAt(0);
|
||||
int variable = UCA.getPrimary(uca.getVariableHigh());
|
||||
|
||||
int columnCount = 0;
|
||||
|
||||
Utility.copyTextFile("index.html", true, "CollationCharts\\index.html");
|
||||
Utility.copyTextFile("charts.css", false, "CollationCharts\\charts.css");
|
||||
Utility.copyTextFile("help.html", true, "CollationCharts\\help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");
|
||||
|
||||
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
indexFile.println("<title>UCA Default Collation Table</title>");
|
||||
indexFile.println("<base target='main'>");
|
||||
indexFile.println("<style><!-- p { font-size: 90% } --></style>");
|
||||
indexFile.println("</head><body><h2 align='center'>UCA Default Collation Table</h2>");
|
||||
indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
|
||||
|
||||
|
@ -102,19 +108,31 @@ public class WriteCharts implements UCD_Types {
|
|||
oldScript = script;
|
||||
}
|
||||
|
||||
int strength = 6;
|
||||
if (lastSortKey != null && sortKey.charAt(0) == lastSortKey.charAt(0)) {
|
||||
strength = uca.strengthDifference(sortKey, lastSortKey);
|
||||
if (strength < 0) strength = -strength;
|
||||
}
|
||||
boolean firstPrimaryEquals = sortKey.charAt(0) == lastSortKey.charAt(0);
|
||||
|
||||
int strength = uca.strengthDifference(sortKey, lastSortKey);
|
||||
if (strength < 0) strength = -strength;
|
||||
lastSortKey = sortKey;
|
||||
|
||||
// find out if this is an expansion: more than one primary weight
|
||||
|
||||
int primaryCount = 0;
|
||||
for (int i = 0; i < sortKey.length(); ++i) {
|
||||
char w = sortKey.charAt(i);
|
||||
if (w == 0) break;
|
||||
++ primaryCount;
|
||||
}
|
||||
|
||||
String breaker = "";
|
||||
if (columnCount > 10 || strength > 5) {
|
||||
if (strength <= 5) breaker = "</tr><tr><td></td>";
|
||||
else breaker = "</tr><tr>";
|
||||
if (columnCount > 10 || !firstPrimaryEquals) {
|
||||
if (!firstPrimaryEquals) breaker = "</tr><tr>";
|
||||
else breaker = "</tr><tr><td></td>"; // indent 1 cell
|
||||
columnCount = 0;
|
||||
}
|
||||
output.println(breaker + CLASSNAME[strength] + s
|
||||
|
||||
String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength];
|
||||
|
||||
output.println(breaker + classname + nfc.normalize(s)
|
||||
+ "<br><tt>" + Utility.hex(s)
|
||||
//+ "<br>" + script
|
||||
//+ "<br>" + UCA.toString(sortKey)
|
||||
|
@ -133,8 +151,15 @@ public class WriteCharts implements UCD_Types {
|
|||
"<td class='q'>",
|
||||
"<td class='t'>",
|
||||
"<td class='s'>",
|
||||
"<td class='p'>",
|
||||
"<td class='f'>"};
|
||||
"<td class='p'>"};
|
||||
|
||||
// Opening <td> tags used in place of CLASSNAME when a chart entry's sort key
// holds more than one primary weight (an expansion). Indexed by the strength
// difference, exactly like CLASSNAME; indices 0-2 share the same class.
static final String[] XCLASSNAME = {
|
||||
"<td class='eq'>",
|
||||
"<td class='eq'>",
|
||||
"<td class='eq'>",
|
||||
"<td class='et'>",
|
||||
"<td class='es'>",
|
||||
"<td class='ep'>"};
|
||||
|
||||
|
||||
static PrintWriter indexFile;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2001/09/19 23:32:21 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/10/25 20:35:41 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -63,9 +63,13 @@ public class WriteCollationData implements UCD_Types {
|
|||
String arg = args[i];
|
||||
if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
|
||||
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
|
||||
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(collator);
|
||||
else if (arg.equalsIgnoreCase("writeNonspacingDifference")) writeNonspacingDifference();
|
||||
|
||||
else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator);
|
||||
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
|
||||
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
|
||||
else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(collator);
|
||||
|
||||
else if (arg.equalsIgnoreCase("WriteRules")) writeRules(WITHOUT_NAMES);
|
||||
else if (arg.equalsIgnoreCase("WriteRulesXML")) writeRules(IN_XML);
|
||||
|
@ -748,6 +752,47 @@ public class WriteCollationData implements UCD_Types {
|
|||
return len;
|
||||
}
|
||||
|
||||
static void writeNonspacingDifference() throws IOException {
|
||||
PrintWriter diLog = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "UCA_Nonspacing.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
diLog.write('\uFEFF');
|
||||
|
||||
Normalizer nfd = new Normalizer(Normalizer.NFD);
|
||||
|
||||
Set sorted = new TreeSet();
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
byte cat = ucd.getCategory(i);
|
||||
boolean isNonSpacing = cat == Mn || cat == Me;
|
||||
CEList celist = collator.getCEList(UTF32.valueOf32(i), true);
|
||||
boolean isPrimaryIgnorable = true;
|
||||
for (int j = 0; j < celist.length(); ++j) {
|
||||
int ce = celist.at(j);
|
||||
int primary = collator.getPrimary(ce);
|
||||
if (primary != 0) {
|
||||
isPrimaryIgnorable = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isNonSpacing != isPrimaryIgnorable) {
|
||||
sorted.add(ucd.getCategoryID(i)
|
||||
+ "\t" + celist
|
||||
+ "\t" + ucd.getCodeAndName(i));
|
||||
}
|
||||
}
|
||||
|
||||
Utility.print(diLog, sorted, "\r\n");
|
||||
|
||||
diLog.close();
|
||||
}
|
||||
|
||||
static void writeContractions() throws IOException {
|
||||
PrintWriter diLog = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
|
|
Loading…
Add table
Reference in a new issue