minor mods

X-SVN-Rev: 5703
This commit is contained in:
Mark Davis 2001-09-06 01:30:31 +00:00
parent 9b04837ff0
commit 16682de25d
11 changed files with 330 additions and 52 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
* $Date: 2001/08/31 00:20:40 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:30:31 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -19,7 +19,7 @@ import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
public class GenOverlap {
public class GenOverlap implements UCD_Types {
static Map completes = new TreeMap();
static Map back = new HashMap();
@ -146,15 +146,8 @@ public class GenOverlap {
String [] matchChars = new String[1];
// CEList show = getCEList("\u2034");
log.println("<html><head>");
log.println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
log.println("<title>New Page 1</title>");
log.println("<style><!--");
log.println("table { border-style: solid; border-width: 1 }");
log.println("td { border-style: solid; border-width: 1 }");
log.println("--></style>");
log.println("</head><body><table>");
Utility.writeHtmlHeader(log, "Overlaps");
log.print("<table>");
while (it.hasNext()) {
Utility.dot(counter++);
@ -500,4 +493,168 @@ public class GenOverlap {
}
log.flush();
}
/**
 * Gathers statistics on how collation primary weights distribute over a hash
 * table of fixed size, and writes them to "checkstringsearchhash.html".
 *
 * For every allocated code point that is unchanged by NFKD and is not an
 * uppercase letter (category Lu), each primary weight of its collation elements
 * is reduced modulo the table length. A primary is counted once per script and
 * once overall; the overall totals are kept in the UNUSED_SCRIPT slot.
 * The report lists one summary row per script, an "All" row, and the Latin
 * characters whose primaries landed in a shared bucket.
 *
 * @param collatorIn collator whose collation elements are examined; stored in the
 * static collator field
 * @throws Exception whatever the underlying UCD/UCA or file calls throw
 */
public static void checkHash(UCA collatorIn) throws Exception {
    collator = collatorIn;

    System.out.println("# Check Hash");
    System.out.println("# Generated " + new Date());

    ucd = UCD.make();

    //nfd = new Normalizer(Normalizer.NFD);
    //nfkd = new Normalizer(Normalizer.NFKD);
    // NOTE(review): nfd is handed to getCollationContents before it is assigned
    // just below, so whatever value the static field held on entry is used.
    // cc is only read by the disabled loop further down; confirm the intended order.
    UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
    nfd = new Normalizer(Normalizer.NFD);
    nfkd = new Normalizer(Normalizer.NFKD);

    int tableLength = 257;
    /*
    Other primes, presumably candidate table sizes:
    257 263 269 271 277 281 283 293 307 311 313 317
    331 337 347 349 353 359 367 373 379 383 389 397
    401 409 419 421 431 433 439 443 449 457 461 463
    467 479 487 491 499 503 509 521 523 541 547 557
    563 569 571 577 587 593 599 601 607 613 617 619
    631 641 643 647 653 659 661 673 677 683 691 701
    709 719 727 733 739 743 751 757 761 769 773 787
    797 809 811 821 823 827 829 839 853 857 859 863
    877 881 883 887 907 911 919 929 937 941 947 953
    967 971 977 983 991 997
    */

    // collisions[script][bucket] = number of distinct primaries hashed to that bucket;
    // repeats[script] remembers which primaries were already counted for that script.
    int [][] collisions = new int[LIMIT_SCRIPT][];
    BitSet[] repeats = new BitSet[LIMIT_SCRIPT];
    for (int i = 0; i < collisions.length; ++i) {
        collisions[i] = new int[tableLength];
        repeats[i] = new BitSet();
    }

    int counter = 0;
    int[] lenArray = new int[1];

    // Disabled variant that walks the collator's own contents instead of all code points.
    if (false) while (true) {
        Utility.dot(counter++);
        String s = cc.next(ces, lenArray);
        if (s == null) break;
        if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures
        int cp = UTF16.charAt(s, 0);
        if (nfkd.normalizationDiffers(cp)) continue;
        int script = ucd.getScript(cp);
        int len = lenArray[0];
        for (int i = 0; i < len; ++i) {
            int prim = UCA.getPrimary(ces[i]);
            int hash = prim % tableLength;
            if (!repeats[script].get(prim)) {
                ++collisions[script][hash];
                repeats[script].set(prim);
            } else {
                System.out.println("Skipping: " + prim + " in " + ucd.getCodeAndName(cp));
            }
            if (!repeats[UNUSED_SCRIPT].get(prim)) {
                ++collisions[UNUSED_SCRIPT][hash];
                repeats[UNUSED_SCRIPT].set(prim);
            }
        }
    }

    // latin[bucket] accumulates the Latin characters that contributed a new primary to that bucket.
    String [] latin = new String[tableLength];
    for (int i = 0; i < latin.length; ++i) {
        latin[i] = "";
    }

    // The code point range is inclusive of U+10FFFF, the last code point.
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        Utility.dot(counter++);
        if (!ucd.isAllocated(cp)) continue;
        if (nfkd.normalizationDiffers(cp)) continue;
        if (ucd.getCategory(cp) == Lu) continue; // don't count case
        String scp = UTF16.valueOf(cp);
        int len = collator.getCEs(scp, true, ces);
        int script = ucd.getScript(cp);
        for (int i = 0; i < len; ++i) {
            int prim = UCA.getPrimary(ces[i]);
            int hash = prim % tableLength;
            if (!repeats[script].get(prim)) {
                ++collisions[script][hash];
                repeats[script].set(prim);
                if (script == LATIN_SCRIPT) latin[hash] += scp;
            }
            if (!repeats[UNUSED_SCRIPT].get(prim)) {
                ++collisions[UNUSED_SCRIPT][hash];
                repeats[UNUSED_SCRIPT].set(prim);
            }
        }
    }
    System.out.println("Data Gathered");

    PrintWriter log = Utility.openPrintWriter("checkstringsearchhash.html");
    try {
        Utility.writeHtmlHeader(log, "Check Hash");
        log.println("<h1>Collisions</h1>");
        log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + ".");
        log.println("Note: All duplicate primarys are removed: all non-colliding values are removed.</p>");
        log.println("<table><tr><th>Script</th><th>Sum</th><th>Average</th><th>Std Dev.</th></tr>");

        // One row per script; the UNUSED_SCRIPT slot holds the overall totals and is shown last as "All".
        for (byte i = 0; i < collisions.length; ++i) {
            if (i == UNUSED_SCRIPT) continue;
            showCollisions(log, ucd.getScriptID_fromIndex(i), collisions[i]);
        }
        showCollisions(log, "All", collisions[UNUSED_SCRIPT]);
        log.println("</table>");

        log.println("<p>Details of collisions for Latin</p>");
        for (int i = 0; i < latin.length; ++i) {
            // Fewer than two UTF-16 code units recorded: nothing collided in this bucket.
            if (latin[i].length() < 2) continue;
            //if (UTF16.countCodePoint(latin[i]) < 2) continue;
            int cp2;
            log.println("<table>");
            for (int j = 0; j < latin[i].length(); j += UTF16.getCharCount(cp2)) {
                cp2 = UTF16.charAt(latin[i], j);
                String scp2 = UTF16.valueOf(cp2);
                CEList clist = collator.getCEList(scp2, true);
                log.println("<tr><td>" + scp2 + "</td><td>" + clist + "</td><td>" + ucd.getCodeAndName(cp2) + "</td></tr>");
            }
            log.println("</table><br>");
        }
    } finally {
        // Close the report even when generation fails part-way.
        log.close();
    }
}
// Formatters for the report columns: two decimals for averages, none for sums.
static java.text.NumberFormat nf = new java.text.DecimalFormat("#,##0.00");
static java.text.NumberFormat nf0 = new java.text.DecimalFormat("#,##0");

/**
 * Writes one HTML table row summarising a bucket histogram.
 * Only non-empty buckets take part: the row shows their sum, their mean
 * occupancy, and the population standard deviation of their occupancy.
 * A histogram with no non-empty bucket is reported as 0 / 0.00 / 0.00
 * rather than dividing by zero.
 *
 * @param log writer that receives the row
 * @param title text of the first cell
 * @param curr bucket counts
 */
static void showCollisions(PrintWriter log, String title, int[] curr) {
    double sum = 0;
    int count = 0;
    for (int j = 0; j < curr.length; ++j) {
        if (curr[j] == 0) continue;
        sum += curr[j];
        ++count;
    }

    double average = 0;
    double sd = 0;
    if (count != 0) { // avoid 0/0 = NaN when every bucket is empty
        average = sum / count;
        double squares = 0;
        for (int j = 0; j < curr.length; ++j) {
            if (curr[j] == 0) continue;
            double deviation = curr[j] - average;
            squares += deviation * deviation;
        }
        sd = Math.sqrt(squares / count);
    }

    log.println("<tr><td>" + title
        + "</td><td align='right'>" + nf0.format(sum)
        + "</td><td align='right'>" + nf.format(average)
        + "</td><td align='right'>" + nf.format(sd)
        + "</td></tr>");
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2001/08/31 00:20:40 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:30:31 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -427,6 +427,32 @@ final public class UCA {
return outpos;
}
/**
 * Returns the collation elements of a string as a CEList.
 * The elements are produced by getCEs into a reusable buffer; whenever getCEs
 * overruns that buffer, the buffer is replaced by one twice as long and the
 * call is retried.
 * @param sourceString string whose collation elements are wanted
 * @param decomposition true for UCA, false where the text is guaranteed to be
 * normalization form C with no combining marks of class 0.
 * @return a CEList constructed from the first entries of the buffer, as many as getCEs reported
 */
public CEList getCEList(String sourceString, boolean decomposition) {
    int ceCount = 0;
    boolean fits = false;
    while (!fits) {
        try {
            ceCount = getCEs(sourceString, decomposition, ceListBuffer);
            fits = true;
        } catch (ArrayIndexOutOfBoundsException overflow) {
            // buffer too short for this string: double it and try again
            ceListBuffer = new int[ceListBuffer.length * 2];
        }
    }
    // NOTE(review): the buffer is reused by later calls; presumably CEList copies it -- confirm.
    return new CEList(ceListBuffer, 0, ceCount);
}

int[] ceListBuffer = new int[30]; // temporary storage, to avoid multiple creation
/**
* Get Usage
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2001/08/31 00:20:39 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:30:30 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -64,6 +64,7 @@ public class WriteCollationData implements UCD_Types {
String arg = args[i];
if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
else if (arg.equalsIgnoreCase("WriteRules")) writeRules(WITHOUT_NAMES);

View file

@ -5,14 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/09/01 01:11:13 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.*;
public class DerivedProperty implements UCD_Types {
@ -130,33 +131,61 @@ public class DerivedProperty implements UCD_Types {
}
// Reports whether the code point has this derived property.
// Code points whose decomposition type is NONE never have it.
// NOTE(review): the two tests that follow look like the before- and after-versions
// of the same check (string-length comparison vs. code point count of the
// normalized form) captured together from a diff -- confirm which one is intended.
boolean hasProperty(int cp) {
if (ucdData.getDecompositionType(cp) == NONE) return false;
// Test 1: true when normalizing leaves the UTF32.length32 length unchanged.
String cps = UTF32.valueOf32(cp);
if (UTF32.length32(nfx.normalize(cps)) == UTF32.length32(cps)) return true;
// Test 2: true when the normalized form is not exactly one code point.
String norm = nfx.normalize(cp);
if (UTF16.countCodePoint(norm) != 1) return true;
return false;
}
};
class GenDProp extends DProp {
Normalizer nfx;
Normalizer nfComp = null;
GenDProp (int i) {
nfx = nf[i-GenNFD];
name = NAME[i-GenNFD];
String compName = "the character itself";
if (i == GenNFKC || i == GenNFD) {
name += "-NFC";
nfComp = nfc;
compName = "NFC for the character";
} else if (i == GenNFKD) {
name += "-NFD";
nfComp = nfd;
compName = "NFD for the character";
}
header = "# Derived Property: " + name
+ "\r\n# Normalized forms, where different from the characters themselves."
+ "\r\n# Normalized form " + NAME[i-GenNFD] + ", where DIFFERENT from " + compName + "."
+ "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly."
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
}
public boolean propertyVaries() {return true;} // default
public String getProperty(int cp) {
if (ucdData.getDecompositionType(cp) == NONE) return "";
String cps = UTF32.valueOf32(cp);
if (cps.equals(nfx.normalize(cps))) {
return "";
int cacheCp = 0;
String cacheStr = "";
public String getProperty(int cp) {
if (cacheCp == cp) return cacheStr;
cacheCp = cp;
cacheStr = "";
if (ucdData.getDecompositionType(cp) != NONE) {
String cps = UTF32.valueOf32(cp);
String comp = cps;
if (nfComp != null) {
comp = nfComp.normalize(comp);
}
String normal = nfx.normalize(cps);
if (!comp.equals(normal)) {
String norm = Utility.hex(normal);
String pad = Utility.repeat(" ", 14-norm.length());
cacheStr = name + "; " + norm + pad;
}
}
String norm = Utility.hex(nfx.normalize(cp));
String pad = Utility.repeat(" ", 14-norm.length());
return name + "; " + norm + pad;
return cacheStr;
//if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
} // default

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2001/09/01 00:06:15 $
* $Revision: 1.3 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -94,6 +94,7 @@ final class DerivedPropertyLister extends PropertyLister {
String prop = dprop.getProperty(cp, propMask);
if (prop.length() == 0) return EXCLUDE;
if (prop.equals(last)) return INCLUDE;
last = prop;
return BREAK;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/09/01 01:11:13 $
* $Revision: 1.4 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -54,6 +54,10 @@ public class GenerateData implements UCD_Types {
mask = Utility.clearBit(mask, DerivedProperty.FullCompInclusion);
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-" + version );
} else if (arg.equalsIgnoreCase("DerivedFullNormalization")) {
mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC);
generateDerived(mask, HEADER_DERIVED, "DerivedFullNormalization-" + version );
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedEastAsianWidth-" + version );

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2001/09/01 00:06:48 $
* $Revision: 1.1 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -36,6 +36,8 @@ public final class Main {
ConvertUCD.main(new String[]{ucdVersion});
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
else if (arg.equalsIgnoreCase("checkCase2")) VerifyUCD.checkCase2();

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -254,7 +254,7 @@ public final class Normalizer implements UCD_Types {
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(buffer, j);
ch = UTF16.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
@ -263,7 +263,7 @@ public final class Normalizer implements UCD_Types {
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16Plus.charAt(target, k-1);
ch2 = UTF16.charAt(target, k-1);
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
@ -281,7 +281,7 @@ public final class Normalizer implements UCD_Types {
*/
private void internalCompose(StringBuffer target) {
int starterPos = 0;
int starterCh = UTF16Plus.charAt(target,0);
int starterCh = UTF16.charAt(target,0);
int compPos = UTF16.getCharCount(starterCh); // length of last composition
int lastClass = data.getCanonicalClass(starterCh);
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
@ -291,7 +291,7 @@ public final class Normalizer implements UCD_Types {
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(target, decompPos);
ch = UTF16.charAt(target, decompPos);
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2001/09/01 00:06:15 $
* $Revision: 1.3 $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -1027,5 +1027,56 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
}
/**
 * Crude timing comparison of the three ways of building a two-char String
 * implemented by dummy2, dummy1 and dummy3 (run in that order).
 * Prints the elapsed milliseconds of each run, then the accumulated length
 * total so the calls cannot be discarded as unused.
 */
static void checkSpeed() {
    final int iterations = 1000000;
    int total = 0;
    long began;
    long elapsed;

    // shared char[] guarded by synchronized
    began = System.currentTimeMillis();
    for (int n = iterations; n >= 0; --n) {
        total += dummy2(n).length();
    }
    elapsed = System.currentTimeMillis() - began;
    System.out.println("synchronized: " + elapsed);

    // fresh char[] on every call
    began = System.currentTimeMillis();
    for (int n = iterations; n >= 0; --n) {
        total += dummy1(n).length();
    }
    elapsed = System.currentTimeMillis() - began;
    System.out.println("char[] each time: " + elapsed);

    // String concatenation
    began = System.currentTimeMillis();
    for (int n = iterations; n >= 0; --n) {
        total += dummy3(n).length();
    }
    elapsed = System.currentTimeMillis() - began;
    System.out.println("String +: " + elapsed);

    System.out.println(total);
}
/**
 * Builds a two-char String from an int, allocating a new char[] on each call.
 * @param a value to split: the upper 16 bits become the first char, the lower 16 bits the second
 * @return the two-char String
 */
static String dummy1(int a) {
    final char high = (char) (a >>> 16);
    final char low = (char) a;
    return new String(new char[] {high, low});
}
// Scratch array shared by every dummy2 call; also used as the lock object.
static char[] temp2 = new char[2];

/**
 * Builds a two-char String from an int using the shared temp2 array,
 * holding the array's monitor while it is filled and copied.
 * @param a value to split: the upper 16 bits become the first char, the lower 16 bits the second
 * @return the two-char String
 */
static String dummy2(int a) {
    final char high = (char) (a >>> 16);
    final char low = (char) a;
    String result;
    synchronized (temp2) {
        temp2[0] = high;
        temp2[1] = low;
        result = new String(temp2);
    }
    return result;
}
/**
 * Builds a two-char String from an int by String concatenation.
 * @param a value to split: the upper 16 bits become the first char, the lower 16 bits the second
 * @return the two-char String
 */
static String dummy3(int a) {
    final String high = String.valueOf((char) (a >>> 16));
    final char low = (char) a;
    return high + low;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF16Plus.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:29:14 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -14,8 +14,5 @@
package com.ibm.text.utility;
/**
 * Holder for a StringBuffer-based companion to the UTF16 helpers.
 */
public class UTF16Plus {
/**
 * Returns the result of UTF32.char32At for the given buffer and offset.
 * @param source buffer to read from
 * @param offset16 offset passed through to UTF32.char32At
 * (presumably a UTF-16 code unit index -- confirm against UTF32)
 * @return the value returned by UTF32.char32At
 */
public static int charAt(StringBuffer source, int offset16) {
return UTF32.char32At(source, offset16);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2001/09/06 01:29:03 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -408,6 +408,7 @@ public final class Utility { // COMMON UTILITIES
private static final String[] searchPath = {
"EXTRAS",
"3.1.2",
"3.1.1",
"3.1.0",
"3.0.1",
@ -451,6 +452,15 @@ public final class Utility { // COMMON UTILITIES
return null;
}
/**
 * Writes the opening of an HTML report: the head section (UTF-8 content type,
 * title, table styles) followed by the opening body tag.
 * The title is escaped so that '&amp;', '&lt;' and '&gt;' in it cannot be read as markup,
 * and the style lengths carry explicit px units so that CSS parsers accept them.
 *
 * @param log writer that receives the header
 * @param title page title; a null title is written as "null"
 */
public static void writeHtmlHeader(PrintWriter log, String title) {
    String rawTitle = String.valueOf(title);
    StringBuffer safeTitle = new StringBuffer(rawTitle.length());
    for (int i = 0; i < rawTitle.length(); ++i) {
        char c = rawTitle.charAt(i);
        switch (c) {
            case '&': safeTitle.append("&amp;"); break;
            case '<': safeTitle.append("&lt;"); break;
            case '>': safeTitle.append("&gt;"); break;
            default: safeTitle.append(c); break;
        }
    }

    log.println("<html><head>");
    log.println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
    log.println("<title>" + safeTitle + "</title>");
    log.println("<style><!--");
    // Non-zero CSS lengths need a unit; unitless values are discarded by the parser.
    log.println("table { border-collapse: collapse; border: 1px solid blue }");
    log.println("td { border: 1px solid blue; padding: 2px }");
    log.println("th { border: 1px solid blue; padding: 2px }");
    log.println("--></style>");
    log.println("</head><body>");
}
}