Added table of information to fractional UCA, changed XML, regenerated collation test

X-SVN-Rev: 8921
This commit is contained in:
Mark Davis 2002-06-22 01:21:11 +00:00
parent b5b02ebdd8
commit 4482f497ce
13 changed files with 686 additions and 76 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/15 03:15:55 $
* $Revision: 1.20 $
* $Date: 2002/06/22 01:21:08 $
* $Revision: 1.21 $
*
*******************************************************************************
*/
@ -1411,7 +1411,7 @@ F900..FAFF; CJK Compatibility Ideographs
int lastLen = -1;
int[] lastCes = new int[50];
long variableTop = collator.getVariableHigh() & 0xFFFFFFFFL;
long variableTop = collator.getVariableHigh() & INT_MASK;
// for debugging ordering
String lastSortKey = "";
@ -1683,7 +1683,7 @@ F900..FAFF; CJK Compatibility Ideographs
// static final String[] RELATION_NAMES = {" <", " <<", " <<<", " ="};
static final String[] RELATION_NAMES = {" <\t", " <<\t", " <<<\t", " =\t"};
static final String[] XML_RELATION_NAMES = {"g1", "g2", "g3", "eq"};
static final String[] XML_RELATION_NAMES = {"p", "s", "t", "eq"};
static class ArrayWrapper {
int[] array;
@ -2278,17 +2278,27 @@ F900..FAFF; CJK Compatibility Ideographs
log.println("# - Differs from previous version in that MAX value was introduced at 1F.");
log.println("# All tertiary values are shifted down by 1, filling the gap at 7!");
int firstImplicit = getImplicitPrimary(CJK_BASE) >>> 24;
int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
String lastChr = "";
int lastNp = 0;
boolean doVariable = false;
char[] codeUnits = new char[100];
FCE firstSecondaryIgnorable = new FCE(false);
FCE lastSecondaryIgnorable = new FCE(true);
FCE firstPrimaryIgnorable = new FCE(false);
FCE lastPrimaryIgnorable = new FCE(true);
FCE firstVariable = new FCE(false);
FCE lastVariable = new FCE(true);
FCE firstNonIgnorable = new FCE(false);
FCE lastNonIgnorable = new FCE(true);
FCE firstTrailing = new FCE(false);
FCE lastTrailing = new FCE(true);
while (it.hasNext()) {
Object sortKey = it.next();
String chr = (String)ordered.get(sortKey);
@ -2334,7 +2344,7 @@ F900..FAFF; CJK Compatibility Ideographs
// special treatment for unsupported!
if (UCA.isImplicitLeadPrimary(pri)) {
System.out.println("DEBUG: " + CEList.toString(ces, len)
if (DEBUG) System.out.println("DEBUG: " + CEList.toString(ces, len)
+ ", Current: " + q + ", " + ucd.getCodeAndName(chr));
++q;
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
@ -2359,7 +2369,7 @@ F900..FAFF; CJK Compatibility Ideographs
+ " => " + Utility.hex(cp)
+ " => " + Utility.hex(testImplicit[0])
+ ", " + Utility.hex(testImplicit[1])
// + ", " + Utility.hex(fixPrimary(pri) & 0xFFFFFFFFL)
// + ", " + Utility.hex(fixPrimary(pri) & INT_MASK)
);
}
@ -2377,24 +2387,26 @@ F900..FAFF; CJK Compatibility Ideographs
// int oldPrimaryValue = UCA.getPrimary(ces[q]);
int np = fixPrimary(pri);
int ns = fixSecondary(sec);
int nt = fixTertiary(ter);
try {
hexBytes(np, newPrimary);
hexBytes(fixSecondary(sec), newSecondary);
hexBytes(fixTertiary(ter), newTertiary);
hexBytes(ns, newSecondary);
hexBytes(nt, newTertiary);
} catch (Exception e) {
throw new ChainException("Character is {0}", new String[] {Utility.hex(chr)}, e);
}
if (isFirst) {
if (!sameTopByte(np, lastNp)) {
summary.println("Last: " + Utility.hex(lastNp & 0xFFFFFFFFL) + " " + ucd.getName(UTF16.charAt(lastChr,0)));
summary.println("Last: " + Utility.hex(lastNp & INT_MASK) + " " + ucd.getName(UTF16.charAt(lastChr,0)));
summary.println();
if (doVariable) {
doVariable = false;
summary.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!");
summary.println();
}
summary.println("First: " + Utility.hex(np & 0xFFFFFFFFL) + ", " + ucd.getCodeAndName(UTF16.charAt(chr,0)));
summary.println("First: " + Utility.hex(np & INT_MASK) + ", " + ucd.getCodeAndName(UTF16.charAt(chr,0)));
}
lastNp = np;
isFirst = false;
@ -2403,6 +2415,27 @@ F900..FAFF; CJK Compatibility Ideographs
+ ", " + newSecondary
+ ", " + newTertiary
+ "]");
// RECORD STATS
if (np == 0 && ns == 0) {
firstSecondaryIgnorable.setValue(np, ns, nt);
lastSecondaryIgnorable.setValue(np, ns, nt);
} else if (np == 0) {
firstPrimaryIgnorable.setValue(np, ns, nt);
lastPrimaryIgnorable.setValue(np, ns, nt);
} else if (collator.isVariable(ces[q])) {
firstVariable.setValue(np, ns, nt);
lastVariable.setValue(np, ns, nt);
} else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently)
System.out.println("Trailing: " + CEList.toString(ces[q])
+ ", " + Utility.hex(pri) + ", " + Utility.hex(UNSUPPORTED_LIMIT));
firstTrailing.setValue(np, ns, nt);
lastTrailing.setValue(np, ns, nt);
} else if ((pri & MARK_CODE_POINT) == 0) { // skip implicits
firstNonIgnorable.setValue(np, ns, nt);
lastNonIgnorable.setValue(np, ns, nt);
}
}
if (nonePrinted) {
log.print("[,,]");
@ -2412,6 +2445,61 @@ F900..FAFF; CJK Compatibility Ideographs
log.println();
lastChr = chr;
}
int firstImplicit = getImplicitPrimary(CJK_BASE);
int lastImplicit = getImplicitPrimary(0x10FFFF);
log.println("# VALUES BASED ON UCA");
log.println("[first tertiary ignorable " + new FCE(false,0,0, 0).formatFCE() + "]");
log.println("[last tertiary ignorable " + new FCE(true,0,0, 0).formatFCE() + "]");
// Since the UCA doesn't have secondary ignorables, fake them.
if (firstSecondaryIgnorable.isUnset()) {
System.out.println("No first/last secondary ignorable: resetting");
firstSecondaryIgnorable = new FCE(false, 0, 0, COMMON<<24);
lastSecondaryIgnorable = new FCE(true, 0, 0, COMMON<<24);
System.out.println(firstSecondaryIgnorable.formatFCE());
}
log.println("[first secondary ignorable " + firstSecondaryIgnorable.formatFCE() + "]");
log.println("[last secondary ignorable " + lastSecondaryIgnorable.formatFCE() + "]");
log.println("[first primary ignorable " + firstPrimaryIgnorable.formatFCE() + "]");
log.println("[last primary ignorable " + lastPrimaryIgnorable.formatFCE() + "]");
log.println("[first variable " + firstVariable.formatFCE() + "]");
log.println("[last variable " + lastVariable.formatFCE() + "]");
log.println("[first non-ignorable " + firstNonIgnorable.formatFCE() + "]");
log.println("[last non-ignorable " + lastNonIgnorable.formatFCE() + "]");
log.println("[first implicit " + (new FCE(false,firstImplicit, COMMON<<24, COMMON<<24)).formatFCE() + "]");
log.println("[last implicit " + (new FCE(false,lastImplicit, COMMON<<24, COMMON<<24)).formatFCE() + "]");
if (firstTrailing.isUnset()) {
System.out.println("No first/last trailing: resetting");
firstTrailing = new FCE(false, (IMPLICIT_LIMIT_BYTE+1)<<24, COMMON<<24, COMMON<<24);
lastTrailing = new FCE(true, (IMPLICIT_LIMIT_BYTE+1)<<24, COMMON<<24, COMMON<<24);
System.out.println(firstTrailing.formatFCE());
}
log.println("[first trailing " + firstTrailing.formatFCE() + "]");
log.println("[last trailing " + lastTrailing.formatFCE() + "]");
log.println("# FIXED VALUES");
log.println("[top " + Utility.hex(0xA0,2) + "]");
log.println("[first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]");
log.println("[last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]");
log.println("[first trail byte" + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]");
log.println("[last implicit byte" + Utility.hex(SPECIAL_BASE-1,2) + "]");
log.println("[first special byte" + Utility.hex(SPECIAL_BASE,2) + "]");
log.println("[last special byte" + Utility.hex(0xFF,2) + "]");
summary.println("Last: " + Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0)));
/*
@ -2423,19 +2511,19 @@ F900..FAFF; CJK Compatibility Ideographs
}
*/
summary.println();
summary.println("# First Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0)));
summary.println("# Last Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x10FFFF)));
summary.println("# First CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4E00)));
summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0xFA2F)));
summary.println("# First CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x3400)));
summary.println("# Last CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF)));
summary.println("# First Implicit: " + Utility.hex(INT_MASK & getImplicitPrimary(0)));
summary.println("# Last Implicit: " + Utility.hex(INT_MASK & getImplicitPrimary(0x10FFFF)));
summary.println("# First CJK: " + Utility.hex(INT_MASK & getImplicitPrimary(0x4E00)));
summary.println("# Last CJK: " + Utility.hex(INT_MASK & getImplicitPrimary(0xFA2F)));
summary.println("# First CJK_A: " + Utility.hex(INT_MASK & getImplicitPrimary(0x3400)));
summary.println("# Last CJK_A: " + Utility.hex(INT_MASK & getImplicitPrimary(0x4DBF)));
boolean lastOne = false;
for (int i = 0; i < 0x10FFFF; ++i) {
boolean thisOne = ucd.isCJK_BASE(i) || ucd.isCJK_AB(i);
if (thisOne != lastOne) {
summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(INT_MASK & getImplicitPrimary(i-1)));
summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(INT_MASK & getImplicitPrimary(i)));
lastOne = thisOne;
}
}
@ -2479,6 +2567,104 @@ F900..FAFF; CJK Compatibility Ideographs
summary.close();
}
// Mask selecting the low 32 bits of a long. AND-ing an int with it yields the
// int's bit pattern as a non-negative long (no sign extension).
static final long INT_MASK = 0xFFFFFFFFL;
/**
 * Accumulator for the extreme (first or last) fractional collation element
 * seen in a category. Values are offered through {@link #setValue}; the
 * instance keeps either the lexicographically largest (max == true) or the
 * lexicographically smallest (max == false) (primary, secondary, tertiary)
 * triple. Each weight is held as a long containing the unsigned value of
 * the 32-bit int that was supplied.
 */
static class FCE {
    /** Sentinel for "nothing recorded yet" in a minimum tracker. */
    static final long UNDEFINED_MAX = Long.MAX_VALUE;
    /** Sentinel for "nothing recorded yet" in a maximum tracker. */
    static final long UNDEFINED_MIN = Long.MIN_VALUE;

    /** key[0] = primary, key[1] = secondary, key[2] = tertiary. */
    long[] key;
    /** true: track the largest triple; false: track the smallest. */
    boolean max;
    /** When true, setValue echoes every offered triple to System.out. */
    boolean debugShow = false;

    /**
     * Creates an unset tracker.
     * @param max true to track the maximum, false to track the minimum
     */
    FCE (boolean max) {
        this.max = max;
        if (max) {
            // start below every real value so the first setValue always wins
            key = new long[] {UNDEFINED_MIN, UNDEFINED_MIN, UNDEFINED_MIN};
        } else {
            // start above every real value so the first setValue always wins
            key = new long[] {UNDEFINED_MAX, UNDEFINED_MAX, UNDEFINED_MAX};
        }
    }

    /**
     * Creates a tracker preloaded with a full triple; the ints are treated
     * as unsigned 32-bit values.
     */
    FCE (boolean max, int primary, int secondary, int tertiary) {
        this(max);
        key[0] = primary & INT_MASK;
        key[1] = secondary & INT_MASK;
        key[2] = tertiary & INT_MASK;
    }

    /**
     * Creates a tracker preloaded with a primary only; secondary and
     * tertiary stay at the "unset" sentinel.
     */
    FCE (boolean max, int primary) {
        this(max);
        key[0] = primary & INT_MASK;
    }

    /** Returns true when neither sentinel is present in a weight. */
    private static boolean isDefined(long weight) {
        return weight != UNDEFINED_MIN && weight != UNDEFINED_MAX;
    }

    /** @return true if no primary has been recorded yet */
    boolean isUnset() {
        return !isDefined(key[0]);
    }

    /**
     * Formats the recorded triple as "[p, s, t]". Unset weights print as
     * "?", except that a weight left at UNDEFINED_MAX below a defined
     * higher-level weight is printed as COMMON (see getBuffer).
     */
    String formatFCE() {
        String b0 = getBuffer(key[0], false);
        boolean key0Defined = isDefined(key[0]);
        String b1 = getBuffer(key[1], key0Defined);
        boolean key1Defined = isDefined(key[1]);
        if (b1.length() != 0) b1 = " " + b1;
        String b2 = getBuffer(key[2], key0Defined || key1Defined);
        if (b2.length() != 0) b2 = " " + b2;
        return "[" + b0 + "," + b1 + "," + b2 + "]";
    }

    /**
     * Renders one weight through hexBytes.
     * @param val the weight, or one of the UNDEFINED_* sentinels
     * @param haveHigher whether a higher-level weight is defined; if so an
     *        UNDEFINED_MAX weight is replaced by COMMON << 24 instead of "?"
     * @return the formatted weight, or "?" when it is undefined
     */
    String getBuffer(long val, boolean haveHigher) {
        if (val == UNDEFINED_MIN) return "?";
        if (val == UNDEFINED_MAX) {
            if (!haveHigher) return "?";
            val = COMMON << 24;
        }
        StringBuffer result = new StringBuffer();
        hexBytes(val, result);
        return result.toString();
    }

    /**
     * Offers a triple to the tracker. The triple replaces the stored one
     * when it is lexicographically greater (max tracker) or smaller (min
     * tracker) than what is stored, comparing primary, then secondary,
     * then tertiary as unsigned 32-bit values.
     */
    void setValue(int npInt, int nsInt, int ntInt) {
        if (debugShow) System.out.println("Setting FCE: "
            + Utility.hex(npInt) + ", " + Utility.hex(nsInt) + ", " + Utility.hex(ntInt));
        // widen without sign extension so weights >= 0x80000000 compare correctly
        long np = npInt & INT_MASK;
        long ns = nsInt & INT_MASK;
        long nt = ntInt & INT_MASK;
        if (max) {
            if (np < key[0]) return;
            if (np > key[0]) {
                key[0] = np;
                key[1] = ns;
                key[2] = nt;
                return;
            }
            // primaries equal
            if (ns < key[1]) return;
            if (ns > key[1]) {
                key[1] = ns;
                key[2] = nt;
                return;
            }
            // primaries and secondaries equal
            if (nt > key[2]) {
                key[2] = nt;
            }
        } else {
            if (np > key[0]) return;
            if (np < key[0]) {
                key[0] = np;
                key[1] = ns;
                key[2] = nt;
                return;
            }
            // primaries equal
            if (ns > key[1]) return;
            if (ns < key[1]) {
                key[1] = ns;
                key[2] = nt;
                return;
            }
            // primaries and secondaries equal: keep the SMALLER tertiary.
            // (This comparison used to be '>', copied from the max branch,
            // which made a "first" FCE retain the largest tertiary.)
            if (nt < key[2]) {
                key[2] = nt;
            }
        }
    }
}
/*
static boolean isFixedIdeograph(int cp) {
return (0x3400 <= cp && cp <= 0x4DB5
@ -2566,9 +2752,12 @@ static int swapCJK(int i) {
return i + NON_CJK_OFFSET; // non-CJK
}
// CONSTANTS
// Fractional UCA Generation Constants
static final int
TOP = 0xA0,
SPECIAL_BASE = 0xF0,
NON_CJK_OFFSET = 0x110000,
BYTES_TO_AVOID = 3,
OTHER_COUNT = 256 - BYTES_TO_AVOID,
@ -2659,12 +2848,12 @@ static int swapCJK(int i) {
static void showImplicit2(String title, int cp) {
System.out.println(title + ":\t" + Utility.hex(cp)
+ " => " + Utility.hex(swapCJK(cp))
+ " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp)));
+ " => " + Utility.hex(INT_MASK & getImplicitPrimary(cp)));
}
static void showImplicit3(String title, int cp) {
System.out.println("*" + title + ":\t" + Utility.hex(cp)
+ " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimaryFromSwapped(cp)));
+ " => " + Utility.hex(INT_MASK & getImplicitPrimaryFromSwapped(cp)));
}
// TEST PROGRAM
@ -2679,7 +2868,7 @@ static int swapCJK(int i) {
// test monotonically increasing
for (int i = 0; i < 0x21FFFF; ++i) {
long newPrimary = 0xFFFFFFFFL & getImplicitPrimaryFromSwapped(i);
long newPrimary = INT_MASK & getImplicitPrimaryFromSwapped(i);
if (newPrimary < oldPrimary) {
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+ Utility.hex(oldChar) + " (" + Utility.hex(oldPrimary) + ")"
@ -2730,7 +2919,7 @@ static int swapCJK(int i) {
}
long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
long newPrimary = INT_MASK & getImplicitPrimary(i);
// test correct values

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.14 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -22,6 +22,10 @@ public final class DerivedProperty implements UCD_Types {
// ADD CONSTANT to UCD_TYPES
/**
 * Convenience overload: same as {@code make(derivedPropertyID, Default.ucd)}.
 */
static public UnicodeProperty make(int derivedPropertyID) {
    return make(derivedPropertyID, Default.ucd);
}
static public UnicodeProperty make(int derivedPropertyID, UCD ucd) {
if (derivedPropertyID < 0 || derivedPropertyID >= DERIVED_PROPERTY_LIMIT) return null;
DerivedProperty dp = getCached(ucd);

View file

@ -5,17 +5,19 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.6 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.icu.text.UnicodeSet;
import java.io.*;
class DiffPropertyLister extends PropertyLister {
private UCD oldUCD;
private UnicodeSet set = new UnicodeSet();
private static final int NOPROPERTY = -1;
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output, int property) {
@ -34,6 +36,10 @@ class DiffPropertyLister extends PropertyLister {
/**
 * Convenience constructor: delegates to the four-argument constructor,
 * passing NOPROPERTY as the property.
 */
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
    this(oldUCDName, newUCDName, output, NOPROPERTY);
}
/**
 * Returns the set field of this lister. This is the internal UnicodeSet
 * instance itself, not a copy.
 */
public UnicodeSet getSet() {
    return set;
}
public String valueName(int cp) {
return major_minor_only(ucdData.getVersion());
@ -64,7 +70,13 @@ class DiffPropertyLister extends PropertyLister {
public byte status(int cp) {
if (newProp == null) {
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
set.add(cp);
return INCLUDE;
}
else {
return EXCLUDE;
}
}
// just look at property differences among allocated characters
@ -74,7 +86,10 @@ class DiffPropertyLister extends PropertyLister {
String val = newProp.getValue(cp);
String oldVal = oldProp.getValue(cp);
if (!oldVal.equals(val)) return INCLUDE;
if (!oldVal.equals(val)) {
set.add(cp);
return INCLUDE;
}
return EXCLUDE;
/*if (cp == 0xFFFF) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.19 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -1083,6 +1083,9 @@ public class GenerateData implements UCD_Types {
String newFile = directory + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile);
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
DiffPropertyLister dpl;
UnicodeSet cummulative = new UnicodeSet();
try {
for (int i = 0; i < list.length; ++i) {
int prop = list[i];
@ -1094,29 +1097,60 @@ public class GenerateData implements UCD_Types {
//new DiffPropertyLister("3.2.0", "1.1.0", log, prop).print();
log.println();
log.println(HORIZONTAL_LINE);
new DiffPropertyLister("3.2.0", "2.0.0", log, prop).print();
log.println();
dpl = new DiffPropertyLister("3.2.0", "2.0.0", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
new DiffPropertyLister("3.2.0", "2.1.2", log, prop).print();
log.println();
dpl = new DiffPropertyLister("3.2.0", "2.1.2", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
new DiffPropertyLister("3.2.0", "2.1.5", log, prop).print();
log.println();
dpl = new DiffPropertyLister("3.2.0", "2.1.5", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
new DiffPropertyLister("3.2.0", "2.1.8", log, prop).print();
log.println();
dpl = new DiffPropertyLister("3.2.0", "2.1.8", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
new DiffPropertyLister("3.2.0", "3.0.0", log, prop).print();
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.2.0", "3.0.1", log, prop).print();
dpl = new DiffPropertyLister("3.2.0", "3.0.0", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.2.0", "3.1.0", log, prop).print();
dpl = new DiffPropertyLister("3.2.0", "3.0.1", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.2.0", "3.1.1", log, prop).print();
dpl = new DiffPropertyLister("3.2.0", "3.1.0", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
log.println();
dpl = new DiffPropertyLister("3.2.0", "3.1.1", log, prop);
dpl.print();
cummulative.addAll(dpl.getSet());
log.println(HORIZONTAL_LINE);
log.println();
log.println("Cummulative differences");
UnicodeProperty up = DerivedProperty.make(prop, Default.ucd);
UnicodeSet newProp = up.getSet();
Utility.showSetNames(log, "", cummulative.removeAll(newProp), false, false, Default.ucd);
}
} finally {
if (log != null) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/06/15 02:47:14 $
* $Revision: 1.16 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -83,7 +83,8 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false);
else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties();
else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold();
else if (arg.equalsIgnoreCase("idn")) VerifyUCD.VerifyIDN();
else if (arg.equalsIgnoreCase("genIDN")) VerifyUCD.genIDN();
else if (arg.equalsIgnoreCase("VerifyIDN")) VerifyUCD.VerifyIDN();
else if (arg.equalsIgnoreCase("NFTest")) VerifyUCD.NFTest();
else if (arg.equalsIgnoreCase("test1")) VerifyUCD.test1();
else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros();

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/06/15 02:47:13 $
* $Revision: 1.14 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -1013,7 +1013,7 @@ to guarantee identifier closure.
if (fixStrings) {
if (result.name == null) {
result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
System.out.println("Warning: fixing name for " + result.name);
// System.out.println("Warning: fixing name for " + result.name);
}
if (result.shortName == null) {
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.6 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -23,6 +23,10 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
int propValue;
// DerivedProperty dp;
/**
 * Convenience overload: same as {@code make(propMask, Default.ucd)}.
 */
public static UnicodeProperty make(int propMask) {
    return make(propMask, Default.ucd);
}
public static UnicodeProperty make(int propMask, UCD ucd) {
if ((propMask & 0xFF00) == DERIVED) {
return DerivedProperty.make(propMask & 0xFF, ucd);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.16 $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -22,11 +22,14 @@ import java.io.*;
//import java.text.Un;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.*;
import java.text.NumberFormat;
public class VerifyUCD implements UCD_Types {
static final boolean DEBUG = false;
static void oneTime() {
Default.setUCD();
@ -1099,7 +1102,288 @@ can help you narrow these down.
}
probe.put(new Integer(cp), Default.ucd.getCodeAndName(cp) + " (" + Default.ucd.getCategoryID(cp) + ")" + option);
}
/**
 * Writes a report of how two sets differ. Nothing is written when the sets
 * are equal. Otherwise the report lists the members only in s1, then the
 * members only in s2, and (when requested) the members common to both.
 *
 * @param log   destination for the report
 * @param s1    first set (not modified; copies are taken)
 * @param name1 label used for s1 in the headings
 * @param s2    second set (not modified; copies are taken)
 * @param name2 label used for s2 in the headings
 * @param both  if true, also list the intersection
 */
static void showDifferences(PrintWriter log, UnicodeSet s1, String name1, UnicodeSet s2, String name2, boolean both) {
    // identical sets: no report at all
    if (s1.equals(s2)) {
        return;
    }

    log.println();
    log.println("In " + name1 + ", but NOT " + name2);
    UnicodeSet onlyInFirst = new UnicodeSet(s1).removeAll(s2);
    Utility.showSetNames(log," ", onlyInFirst, false, false, Default.ucd);
    log.println();

    log.println("NOT in " + name1 + ", but in " + name2);
    UnicodeSet onlyInSecond = new UnicodeSet(s2).removeAll(s1);
    Utility.showSetNames(log," ", onlyInSecond, false, false, Default.ucd);
    log.println();

    if (!both) {
        return;
    }
    log.println("In both " + name1 + " AND " + name2);
    UnicodeSet shared = new UnicodeSet(s2).retainAll(s1);
    Utility.showSetNames(log," ", shared, false, false, Default.ucd);
    log.println();
}
/**
 * Generates the IDN tables and writes them to "IDN-tables.txt" (opened via
 * Utility.openPrintWriter); progress messages go to System.out.
 * NOTE(review): the table numbering (A.1, B.1-B.3, C.1-C.9) looks like the
 * IDN stringprep drafts -- confirm against the intended specification.
 *
 * Steps, in order:
 *   1. fetch the basic property sets from UnifiedBinaryProperty;
 *   2. build A1, B1 and C1..C9 from them, then remove overlaps;
 *   3. log any remaining pairwise intersections, and any difference between
 *      the union of the tables and the union of the source property sets;
 *   4. build the case-mapping tables B2 and B3 by scanning every code point;
 *   5. print all tables.
 *
 * The log writer is closed in a finally block, so it is released even if
 * an exception is thrown while building or printing the tables.
 *
 * @throws IOException declared by the original signature (presumably from
 *         opening the output file -- verify against Utility.openPrintWriter)
 */
public static void genIDN() throws IOException {
    Default.setUCD();
    PrintWriter log = Utility.openPrintWriter("IDN-tables.txt");
    try {
        // table1
        System.out.println("Getting Basics");
        UnicodeSet unassigned = UnifiedBinaryProperty.make(CATEGORY + UNASSIGNED).getSet();
        System.out.print(".");
        UnicodeSet lineSeparators = UnifiedBinaryProperty.make(CATEGORY+LINE_SEPARATOR).getSet();
        System.out.print(".");
        UnicodeSet paraSeparators = UnifiedBinaryProperty.make(CATEGORY+PARAGRAPH_SEPARATOR).getSet();
        System.out.print(".");
        UnicodeSet spaceSeparators = UnifiedBinaryProperty.make(CATEGORY+SPACE_SEPARATOR).getSet();
        System.out.print(".");
        UnicodeSet noncharacters = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Noncharacter_Code_Point).getSet();
        System.out.print(".");
        UnicodeSet deprecated = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Deprecated).getSet();
        System.out.print(".");
        UnicodeSet format = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
        System.out.print(".");
        UnicodeSet bidi_control = UnifiedBinaryProperty.make(BINARY_PROPERTIES+Bidi_Control).getSet();
        System.out.print(".");
        UnicodeSet binary_IDS = UnifiedBinaryProperty.make(BINARY_PROPERTIES+IDS_BinaryOperator).getSet();
        System.out.print(".");
        UnicodeSet trinary_IDS = UnifiedBinaryProperty.make(BINARY_PROPERTIES+IDS_TrinaryOperator).getSet();
        System.out.print(".");
        UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES+White_space).getSet();
        whitespace.addAll(spaceSeparators); // bug.
        System.out.print(".");
        UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED + DefaultIgnorable).getSet();
        System.out.print(".");
        UnicodeSet privateUse = UnifiedBinaryProperty.make(CATEGORY+PRIVATE_USE).getSet();
        System.out.print(".");
        UnicodeSet control = UnifiedBinaryProperty.make(CATEGORY+Cc).getSet();
        System.out.print(".");
        UnicodeSet surrogate = UnifiedBinaryProperty.make(CATEGORY+SURROGATE).getSet();

        System.out.println("Building Sets");

        // small test:
        if (DEBUG) {
            showDifferences(log, whitespace, "White_Space",
                new UnicodeSet(spaceSeparators).addAll(lineSeparators).addAll(paraSeparators), "Separators", true);
            showDifferences(log, UnifiedBinaryProperty.make(DERIVED + ID_Start).getSet(), "ID_Start",
                UnifiedBinaryProperty.make(DERIVED + Mod_ID_Start).getSet(), "XID_Start", false);
            showDifferences(log, UnifiedBinaryProperty.make(DERIVED + ID_Continue_NO_Cf).getSet(), "ID_Continue",
                UnifiedBinaryProperty.make(DERIVED + Mod_ID_Continue_NO_Cf).getSet(), "XID_Continue", false);
            System.out.println("Done with Test");
        }

        UnicodeSet A1 = new UnicodeSet(unassigned).removeAll(noncharacters);

        // special code for B1
        /*
        B1, old
        00AD; SOFT HYPHEN
        1806; MONGOLIAN TODO SOFT HYPHEN
        180B; MONGOLIAN FREE VARIATION SELECTOR ONE
        180C; MONGOLIAN FREE VARIATION SELECTOR TWO
        180D; MONGOLIAN FREE VARIATION SELECTOR THREE
        200B; ZERO WIDTH SPACE
        200C; ZERO WIDTH NON-JOINER
        200D; ZERO WIDTH JOINER
        FEFF; ZERO WIDTH NO-BREAK SPACE
        */
        UnicodeSet B1 = new UnicodeSet().add(0xAD).add(0x1806).add(0x034F); // START WITH soft hyphen, mongolian soft hyphen, grapheme joiner
        // THEN ADD default ignorables or format characters that are *variation* or *zero width*
        UnicodeSet temp = new UnicodeSet(defaultIgnorable).addAll(format).addAll(spaceSeparators)
            .removeAll(surrogate).removeAll(control); // remove some just to avoid clutter when debugging.
        UnicodeSetIterator it = new UnicodeSetIterator(temp);
        while(it.next()) {
            if (!Default.ucd.isAssigned(it.codepoint)) continue;
            String name = Default.ucd.getName(it.codepoint);
            // every candidate is echoed; the ones added to B1 get a trailing '*'
            System.out.print(Default.ucd.getCodeAndName(it.codepoint));
            if (name.indexOf("VARIATION") >= 0 || name.indexOf("ZERO") >= 0
                || name.indexOf("WORD JOINER") >= 0) {
                B1.add(it.codepoint);
                System.out.print("*");
            }
            System.out.println();
        }

        UnicodeSet C1 = new UnicodeSet(whitespace).removeAll(control).removeAll(lineSeparators)
            .removeAll(paraSeparators);
        UnicodeSet C2 = new UnicodeSet(defaultIgnorable).removeAll(unassigned).removeAll(surrogate)
            .addAll(control).addAll(format).addAll(lineSeparators).addAll(paraSeparators);
        UnicodeSet C3 = new UnicodeSet(privateUse);
        UnicodeSet C4 = new UnicodeSet(noncharacters);
        UnicodeSet C5 = new UnicodeSet(surrogate);
        UnicodeSet C6 = new UnicodeSet(0xFFF9, 0xFFFC).add(0xFFFD);
        UnicodeSet C7 = new UnicodeSet(binary_IDS).addAll(trinary_IDS);
        UnicodeSet C8 = new UnicodeSet(deprecated).addAll(bidi_control);
        UnicodeSet C9 = new UnicodeSet(0xE0001,0xE007F).retainAll(format);

        // FIX UP SETS!!
        // C6, C8 and C9 take precedence over B1; B1, C6, C8 and C9 over C1/C2.
        B1.removeAll(C6);
        B1.removeAll(C8);
        B1.removeAll(C9);
        C1.removeAll(B1);
        C2.removeAll(B1);
        C2.removeAll(C6);
        C2.removeAll(C8);
        C2.removeAll(C9);

        System.out.println("Check that A1, B1, C1..9 are disjoint");

        UnicodeSet[] test = {A1, B1, C1, C2, C3, C4, C5, C6, C7, C8, C9};
        String[] testNames = {"A1", "B1", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"};
        UnicodeSet union = new UnicodeSet();
        for (int i = 0; i < test.length; ++i) {
            union.addAll(test[i]);
            for (int j = i + 1; j < test.length; ++j) {
                if (test[i].containsNone(test[j])) continue;
                log.println(testNames[i] + " and " + testNames[j] + " intersect!");
                UnicodeSet intersection = new UnicodeSet(test[i]).retainAll(test[j]);
                Utility.showSetNames(log," ", intersection, false, true, Default.ucd);
                log.println();
            }
        }

        System.out.println("Check that union works");

        UnicodeSet[] badChars = {unassigned, noncharacters, deprecated, format,
            control, surrogate, privateUse, binary_IDS, trinary_IDS, whitespace, defaultIgnorable,
            lineSeparators, paraSeparators, spaceSeparators};
        UnicodeSet badCharUnion = new UnicodeSet();
        for (int i = 0; i < badChars.length; ++i) {
            badCharUnion.addAll(badChars[i]);
        }
        showDifferences(log, union, "(A1+B1+C1-C9)",
            badCharUnion,
            "(Whitespace+Deprecated+DefaultIgnorable+Separator+Other (cont/format/surr/priv/unass))", false);

        System.out.println("Generating B2, B3");
        log.println("Generating B2, B3");

        // Both maps are keyed by code point (Integer) with the mapped String as value.
        Map B2 = new TreeMap();
        Map B3 = new TreeMap();

        Integer tempInteger = null;
        // inclusive upper bound: the original '<' skipped U+10FFFF
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if (!Default.ucd.isAssigned(i)) continue;
            tempInteger = null;

            String original = UTF16.valueOf(i);
            String caseFold = Default.ucd.getCase(i, FULL, FOLD);
            if (!original.equals(caseFold)) {
                tempInteger = new Integer(i);
                B2.put(tempInteger, caseFold);
                B3.put(tempInteger, caseFold);
            }
            // b = NFKC(fold(i)); c = NFKC(fold(b)). If they differ, c
            // replaces (or supplies) the B2 entry; B3 is left unchanged.
            String b = Default.nfkc.normalize(caseFold);
            String c = Default.nfkc.normalize(Default.ucd.getCase(b, FULL, FOLD));
            if (!c.equals(b)) {
                if (tempInteger != null) {
                    if (DEBUG) {
                        log.println("Possible Conflict");
                        log.println(" " + Default.ucd.getCodeAndName(i));
                        log.println(" => " + Default.ucd.getCodeAndName(caseFold));
                        log.println(" => " + Default.ucd.getCodeAndName(c));
                    }
                } else {
                    tempInteger = new Integer(i);
                    if (DEBUG) {
                        log.println(" " + Default.ucd.getCodeAndName(i));
                        log.println(" => " + Default.ucd.getCodeAndName(c));
                    }
                }
                if (DEBUG) log.println();
                B2.put(tempInteger, c);
            }
        }

        // PRINTOUT
        printIDN_Table(log, "A.1", "Unassigned code points in Unicode " + Default.ucd.getVersion(), A1);
        printIDN_Table(log, "B.1", "Commonly mapped to nothing", B1);
        printIDN_Map(log, "B.2", "Mapping for lowercase used with NFKC", B2, B3);
        printIDN_Map(log, "B.3", "Mapping for lowercase used with no normalization", B3, B2);
        printIDN_Table(log, "C.1", "Space characters", C1);
        printIDN_Table(log, "C.2", "Control characters", C2);
        printIDN_Table(log, "C.3", "Private use", C3);
        printIDN_Table(log, "C.4", "Non-character code points", C4);
        printIDN_Table(log, "C.5", "Surrogate codes", C5);
        printIDN_Table(log, "C.6", "Inappropriate for plain text", C6);
        printIDN_Table(log, "C.7", "Inappropriate for canonical representation", C7);
        printIDN_Table(log, "C.8", "Change display properties (or deprecated)", C8);
        printIDN_Table(log, "C.9", "Tagging characters", C9);

        System.out.println("Done");
    } finally {
        // release the output file on both the normal and the exceptional path
        log.close();
    }
}
/**
 * Prints one mapping table, bracketed by "Start Table"/"End Table" marker
 * lines. Each entry is written as "code; mapping; name". The name is
 * prefixed with "***" when the mapping differs from the one that
 * {@code other} holds for the same key (including when {@code other} has
 * no entry for it).
 *
 * @param log         destination for the table
 * @param tableNumber table label, e.g. "B.2"
 * @param description human-readable title; also echoed to System.out
 * @param map         Integer code point -> String mapping to print
 * @param other       companion map used only for the "***" comparison
 */
public static void printIDN_Map(PrintWriter log, String tableNumber, String description, Map map, Map other) {
    String title = tableNumber+ " " + description;
    System.out.println(title);
    log.println("");
    log.println(title);
    log.println("");
    log.println("----- Start Table " + tableNumber + " -----");
    for (Iterator entries = map.entrySet().iterator(); entries.hasNext(); ) {
        Map.Entry entry = (Map.Entry) entries.next();
        Integer key = (Integer) entry.getKey();
        String value = (String) entry.getValue();
        int cp = key.intValue();
        // pieces are computed in the same order the original expression evaluated them
        String codeText = Utility.hex(cp, 4);
        String valueText = Utility.hex(value, 4);
        String marker = value.equals(other.get(key)) ? "" : "***";
        String charName = Default.ucd.getName(cp);
        log.println(codeText + "; " + valueText + "; " + marker + charName);
    }
    log.println("----- End Table " + tableNumber + " -----");
}
/**
 * Writes one IDN code point table to {@code log}, framed by "Start Table" /
 * "End Table" marker lines, and echoes the table heading to System.out.
 *
 * The set contents are rendered by {@code Utility.showSetNames} with an
 * empty prefix, separateLines = false and IDN = true.
 *
 * @param log          destination for the table
 * @param tableNumber  table label, e.g. "C.1"
 * @param description  human-readable table title
 * @param set          code points to list
 */
public static void printIDN_Table(PrintWriter log, String tableNumber, String description, UnicodeSet set) {
    final String heading = tableNumber + " " + description;
    final String startMarker = "----- Start Table " + tableNumber + " -----";
    final String endMarker = "----- End Table " + tableNumber + " -----";

    System.out.println(heading);
    log.println("");
    log.println(heading);
    log.println("");
    log.println(startMarker);
    Utility.showSetNames(log, "", set, false, true, Default.ucd);
    log.println(endMarker);
}
public static BitSet guessIDN() {
BitSet result = new BitSet();
@ -1330,9 +1614,11 @@ E0020-E007F; [TAGGING CHARACTERS]
}
if (line.length() == 0) continue;
if (line.charAt(0) == '-') continue;
int count = Utility.split(line,';',parts);
if (count != 3) throw new ChainException("Incorrect # of fields in IDN folding", null);
if (count != 3) throw new ChainException("Incorrect # of fields in IDN folding, line = {0}",
new String[] {line});
String key = Utility.fromHex(parts[0]);
if (UTF32.length32(key) != 1) throw new ChainException("First IDN field not single character: " + line, null);
@ -1393,8 +1679,12 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println("//" + lineNumber + ": '" + line + "'");
}
int commentPos = line.indexOf(';');
if (commentPos >= 0) line = line.substring(0,commentPos);
line = line.trim();
if (line.length() == 0) continue;
if (line.charAt(0) == '-') continue;
int count = Utility.split(line,'-',parts);
if (count > 2) throw new ChainException("Incorrect # of fields in IDN list", null);

View file

@ -1,7 +1,7 @@
<html><body>
<h1
>1. Mismatches when NFD is OFF</h1><h2
>Date:Fri Jun 14 20:11:26 PDT 2002</h2><h2
>Date:Fri Jun 21 16:56:03 PDT 2002</h2><h2
>File Version:-3.1.1d1</h2><p
>Alternate Handling = NON_IGNORABLE</p><table border="1"
><caption

View file

@ -1,5 +1,5 @@
# Fractional UCA Table, generated from standard UCA
# M. Davis, Fri Jun 14 20:11:34 PDT 2002
# M. Davis, Fri Jun 21 16:56:12 PDT 2002
# VERSION: UCA=3.1.1d1, UCD=3.2.0
# Generated processed version, as described in ICU design document.
@ -16,8 +16,6 @@
# WARNING
# - Differs from previous version in that MAX value was introduced at 1F.
# All tertiary values are shifted down by 1, filling the gap at 7!
[FIRST_IMPLICIT= 00E0]
[LAST_IMPLICIT= 00E3]
0000; [,,]
0001; [,,]
@ -17047,3 +17045,26 @@ D87E DE13; [E1 31 58 57, 05, 05]
D87E DE14; [E1 31 63 42, 05, 05]
D87E DC8F; [E1 31 78 AB, 05, 05]
D87E DE1D; [E1 31 AC 81, 05, 05]
# VALUES BASED ON UCA
[first tertiary ignorable [,,]]
[last tertiary ignorable [,,]]
[first secondary ignorable [,, 05]]
[last secondary ignorable [,, 05]]
[first primary ignorable [, 87, 05]]
[last primary ignorable [, E1 B1, 05]]
[first variable [05 07, 05, 05]]
[last variable [17 9B, 05, 05]]
[first non-ignorable [1A 20, 05, 05]]
[last non-ignorable [78 AA B2, 05, 05]]
[first implicit [E0 03 03, 05, 05]]
[last implicit [E3 DC 70 C0, 05, 05]]
[first trailing [E5, 05, 05]]
[last trailing [E5, 05, 05]]
# FIXED VALUES
[top A0]
[first implicit byte E0]
[last implicit byte E4]
[first trail byteE5]
[last implicit byteEF]
[first special byteF0]
[last special byteFF]

View file

@ -1,5 +1,5 @@
# Fractional UCA Table, generated from standard UCA
# M. Davis, Fri Jun 14 20:11:34 PDT 2002
# M. Davis, Fri Jun 21 16:56:12 PDT 2002
# VERSION: UCA=3.1.1d1, UCD=3.2.0
# Generated processed version, as described in ICU design document.
@ -16,8 +16,6 @@
# WARNING
# - Differs from previous version in that MAX value was introduced at 1F.
# All tertiary values are shifted down by 1, filling the gap at 7!
[FIRST_IMPLICIT= 00E0]
[LAST_IMPLICIT= 00E3]
0000; [,,] # [0000.0000.0000] # <NULL>
0001; [,,] # [0000.0000.0000] # <START OF HEADING>
@ -17047,3 +17045,26 @@ D87E DE13; [E1 31 58 57, 05, 05] # [FF85.0020.0002][A20E.0020.0002] # CJK COM
D87E DE14; [E1 31 63 42, 05, 05] # [FF85.0020.0002][A291.0020.0002] # CJK COMPATIBILITY IDEOGRAPH-2FA14
D87E DC8F; [E1 31 78 AB, 05, 05] # [FF85.0020.0002][A392.0020.0002] # CJK COMPATIBILITY IDEOGRAPH-2F88F
D87E DE1D; [E1 31 AC 81, 05, 05] # [FF85.0020.0002][A600.0020.0002] # CJK COMPATIBILITY IDEOGRAPH-2FA1D
# VALUES BASED ON UCA
[first tertiary ignorable [,,]]
[last tertiary ignorable [,,]]
[first secondary ignorable [,, 05]]
[last secondary ignorable [,, 05]]
[first primary ignorable [, 87, 05]]
[last primary ignorable [, E1 B1, 05]]
[first variable [05 07, 05, 05]]
[last variable [17 9B, 05, 05]]
[first non-ignorable [1A 20, 05, 05]]
[last non-ignorable [78 AA B2, 05, 05]]
[first implicit [E0 03 03, 05, 05]]
[last implicit [E3 DC 70 C0, 05, 05]]
[first trailing [E5, 05, 05]]
[last trailing [E5, 05, 05]]
# FIXED VALUES
[top A0]
[first implicit byte E0]
[last implicit byte E4]
[first trail byteE5]
[last implicit byteEF]
[first special byteF0]
[last special byteFF]

File diff suppressed because one or more lines are too long

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.18 $
* $Date: 2002/06/22 01:21:11 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -825,23 +825,54 @@ public final class Utility { // COMMON UTILITIES
return "Showing Stack with fake " + sw.getBuffer().toString();
}
public static void showSetNames(String prefix, UnicodeSet set, boolean all, UCD ucd) {
/**
 * Convenience overload that lists the contents of {@code set} on System.out
 * in the non-IDN format, delegating to the PrintWriter-based overload.
 *
 * @param prefix         text written at the start of every output line
 * @param set            code points to list
 * @param separateLines  if true, every code point gets its own line instead
 *                       of one line per range
 * @param ucd            character database used to look up codes and names
 */
public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) {
    PrintWriter temp = new PrintWriter(System.out);
    showSetNames(temp, prefix, set, separateLines, false, ucd);
    // Flush rather than close: closing the wrapper would also close System.out,
    // silently discarding everything printed to the console afterwards.
    temp.flush();
}
public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) {
int count = set.getRangeCount();
for (int i = 0; i < count; ++i) {
int start = set.getRangeStart(i);
int end = set.getRangeEnd(i);
if (all) {
if (separateLines || (IDN && isSeparateLineIDN(start,end,ucd))) {
for (int cp = start; cp <= end; ++cp) {
if (!set.contains(cp)) continue;
System.out.println(prefix + ucd.getCodeAndName(cp));
if (!IDN) pw.println(prefix + ucd.getCodeAndName(cp));
else {
pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp));
}
}
} else {
System.out.println(prefix + ucd.getCode(start)
+ ((start != end) ? (".." + ucd.getCode(end)) : "")
+ "\t# " + ucd.getName(start)
+ ((start != end) ? (".." + ucd.getName(end)) : "")
);
if (!IDN) {
pw.println(prefix + ucd.getCode(start)
+ ((start != end) ? (".." + ucd.getCode(end)) : "")
+ "\t# " + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "")
);
} else {
pw.println(prefix + Utility.hex(start,4)
+ ((start != end) ? ("-" + Utility.hex(end,4)) : "")
+ (ucd.isAssigned(start)
? "; " + ucd.getName(start) + ((start != end)
? ("-" + ucd.getName(end))
: "")
: "")
);
}
}
}
}
/**
 * Tells whether a single code point forces its enclosing range to be
 * listed one code point per line in IDN-style output.
 *
 * Returns false when the name is computable, when the category is Cn, or
 * when the category is Cc and the code point is not White_space;
 * otherwise returns true.
 *
 * @param cp   code point to test
 * @param ucd  character database queried for name, category and properties
 * @return true if the code point should be listed on its own line
 */
private static boolean isSeparateLineIDN(int cp, UCD ucd) {
    if (ucd.hasComputableName(cp)) {
        return false;
    }
    // Look the category up once and reuse it for both category tests.
    int cat = ucd.getCategory(cp);
    if (cat == UCD_Types.Cn) {
        return false;
    }
    if (cat == UCD_Types.Cc && !ucd.getBinaryProperty(cp, UCD_Types.White_space)) {
        return false;
    }
    return true;
}
/**
 * Range variant: a range is listed one code point per line when either of
 * its two endpoints qualifies under the single-code-point test. Interior
 * code points are not examined.
 *
 * @param start  first code point of the range
 * @param end    last code point of the range
 * @param ucd    character database passed through to the per-code-point test
 * @return true if {@code start} or {@code end} should be listed on its own line
 */
private static boolean isSeparateLineIDN(int start, int end, UCD ucd) {
    if (isSeparateLineIDN(start, ucd)) {
        return true;
    }
    return isSeparateLineIDN(end, ucd);
}
}