mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-11 08:01:32 +00:00
First check-in
X-SVN-Rev: 5636
This commit is contained in:
parent
b3321bad52
commit
1cd275c205
48 changed files with 20878 additions and 0 deletions
216
tools/unicodetools/com/ibm/text/UCA/CEList.java
Normal file
216
tools/unicodetools/com/ibm/text/UCA/CEList.java
Normal file
|
@ -0,0 +1,216 @@
|
|||
package com.ibm.text.UCA;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public final class CEList implements java.lang.Comparable, UCD_Types {
|
||||
int[] contents;
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
int count;
|
||||
|
||||
public CEList (int[] source, int start, int end) {
|
||||
count = end-start;
|
||||
contents = new int[count];
|
||||
System.arraycopy(source, start, contents, 0, count);
|
||||
startOffset = 0;
|
||||
endOffset = count;
|
||||
}
|
||||
|
||||
public CEList(int[] source) {
|
||||
this(source, 0, source.length);
|
||||
}
|
||||
|
||||
private CEList(int[] source, int start, int end, boolean spare) {
|
||||
contents = source;
|
||||
startOffset = start;
|
||||
endOffset = end;
|
||||
count = end - start;
|
||||
}
|
||||
|
||||
public CEList append(CEList that) {
|
||||
int[] newContents = new int[count + that.count];
|
||||
System.arraycopy(contents, startOffset, newContents, 0, count);
|
||||
System.arraycopy(that.contents, that.startOffset, newContents, count, that.count);
|
||||
return new CEList(newContents, 0, count + that.count, true);
|
||||
}
|
||||
|
||||
public CEList sub(int start, int end) {
|
||||
return new CEList(contents, start, end, true);
|
||||
}
|
||||
|
||||
public CEList start(int end) {
|
||||
return new CEList(contents, 0, end, true);
|
||||
}
|
||||
|
||||
public CEList end(int start) {
|
||||
return new CEList(contents, start, contents.length, true);
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public int at(int i) {
|
||||
i -= startOffset;
|
||||
if (i < 0 || i >= count) throw new ArrayIndexOutOfBoundsException(i);
|
||||
return contents[i];
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int result = count;
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
result *= 37;
|
||||
result += contents[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
try {
|
||||
CEList that = (CEList)other;
|
||||
if (count != that.count) return false;
|
||||
int delta = that.startOffset - startOffset;
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
if (contents[i] != that.contents[i + delta]) return false;
|
||||
}
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
CEList that = (CEList)other;
|
||||
try {
|
||||
int delta = that.startOffset - startOffset;
|
||||
int min = endOffset;
|
||||
int min2 = that.endOffset - delta;
|
||||
if (min > min2) min = min2;
|
||||
|
||||
for (int i = startOffset; i < min; ++i) {
|
||||
if (contents[i] != that.contents[i + delta]) {
|
||||
if (contents[i] < that.contents[i + delta]) return -1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (count < that.count) return -1;
|
||||
if (count > that.count) return 1;
|
||||
return 0;
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("This: " + this + ", that: " + other);
|
||||
System.out.println(startOffset + ", " + endOffset
|
||||
+ ", " + count + ", " + contents.length);
|
||||
System.out.println(that.startOffset + ", " + that.endOffset
|
||||
+ ", " + that.count + ", " + that.contents.length);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static byte remap(int ch, byte type, int t) {
|
||||
if (type != CANONICAL) {
|
||||
if (0x3041 <= ch && ch <= 0x3094) t = 0xE; // hiragana
|
||||
else if (0x30A1 <= ch && ch <= 0x30FA) t = 0x11; // katakana
|
||||
}
|
||||
switch (type) {
|
||||
case COMPATIBILITY: t = (t == 8) ? 0xA : 4; break;
|
||||
case COMPAT_FONT: t = (t == 8) ? 0xB : 5; break;
|
||||
case COMPAT_NOBREAK: t = 0x1B; break;
|
||||
case COMPAT_INITIAL: t = 0x17; break;
|
||||
case COMPAT_MEDIAL: t = 0x18; break;
|
||||
case COMPAT_FINAL: t = 0x19; break;
|
||||
case COMPAT_ISOLATED: t = 0x1A; break;
|
||||
case COMPAT_CIRCLE: t = (t == 0x11) ? 0x13 : (t == 8) ? 0xC : 6; break;
|
||||
case COMPAT_SUPER: t = 0x14; break;
|
||||
case COMPAT_SUB: t = 0x15; break;
|
||||
case COMPAT_VERTICAL: t = 0x16; break;
|
||||
case COMPAT_WIDE: t= (t == 8) ? 9 : 3; break;
|
||||
case COMPAT_NARROW: t = (0xFF67 <= ch && ch <= 0xFF6F) ? 0x10 : 0x12; break;
|
||||
case COMPAT_SMALL: t = (t == 0xE) ? 0xE : 0xF; break;
|
||||
case COMPAT_SQUARE: t = (t == 8) ? 0x1D : 0x1C; break;
|
||||
case COMPAT_FRACTION: t = 0x1E; break;
|
||||
}
|
||||
return (byte)t;
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = startOffset; i < endOffset; ++i) {
|
||||
if (i != startOffset) result.append(' ');
|
||||
result.append(toString(contents[i]));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int ce) {
|
||||
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
|
||||
+ Utility.hex(UCA.getSecondary(ce)) + "."
|
||||
+ Utility.hex(UCA.getTertiary(ce)) + "](" + NAME3[UCA.getTertiary(ce)] + ")";
|
||||
}
|
||||
|
||||
static final String[] NAME3 = {
|
||||
"IGNORE", // 0
|
||||
"BLK", // Unused?
|
||||
"MIN",
|
||||
"WIDE",
|
||||
"COMPAT",
|
||||
"FONT",
|
||||
"CIRCLE",
|
||||
"RES-2",
|
||||
"CAP",
|
||||
"WIDECAP",
|
||||
"COMPATCAP",
|
||||
"FONTCAP",
|
||||
"CIRCLECAP",
|
||||
"HIRA-SMALL",
|
||||
"HIRA",
|
||||
"SMALL",
|
||||
"SMALL-NARROW",
|
||||
"KATA",
|
||||
"NARROW",
|
||||
"CIRCLE-KATA",
|
||||
"SUP-MNN",
|
||||
"SUB-MNS",
|
||||
"VERT", // Missing??
|
||||
"AINI",
|
||||
"AMED",
|
||||
"AFIN",
|
||||
"AISO",
|
||||
"NOBREAK", // Missing?
|
||||
"SQUARED",
|
||||
"SQUAREDCAP",
|
||||
"FRACTION",
|
||||
"MAX"
|
||||
};
|
||||
|
||||
// testing
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
/* This: [0241.0020.0004], that: [0F6B.0020.0002]
|
||||
1, 2, 1, 2
|
||||
0, 1, 1, 1
|
||||
*/
|
||||
CEList t1 = new CEList(new int[] {0, 0x02412004});
|
||||
t1 = t1.sub(1,2);
|
||||
CEList t2 = new CEList(new int[] {0x0F6B2002});
|
||||
System.out.println(t1.compareTo(t2));
|
||||
|
||||
|
||||
CEList foo = new CEList(new int[] {0, 1, 2, 3, 4});
|
||||
CEList fuu = new CEList(new int[] {});
|
||||
int cc = foo.compareTo(fuu);
|
||||
System.out.println(cc);
|
||||
|
||||
System.out.println(foo);
|
||||
System.out.println(foo.start(2));
|
||||
System.out.println(foo.end(1));
|
||||
CEList fii = new CEList(new int[] {2, 3});
|
||||
CEList foo2 = foo.sub(2,4);
|
||||
System.out.println(fii.equals(foo2));
|
||||
System.out.println(fii.compareTo(foo2));
|
||||
System.out.println(fii.compareTo(foo));
|
||||
System.out.println(fii.hashCode() == foo2.hashCode());
|
||||
|
||||
}
|
||||
}
|
||||
|
813
tools/unicodetools/com/ibm/text/UCA/Case.java
Normal file
813
tools/unicodetools/com/ibm/text/UCA/Case.java
Normal file
|
@ -0,0 +1,813 @@
|
|||
package com.ibm.text.UCA;
|
||||
|
||||
public final class Case {
|
||||
|
||||
static StringBuffer out = new StringBuffer();
|
||||
|
||||
static String fold(char c) {
|
||||
return fold(String.valueOf(c));
|
||||
}
|
||||
|
||||
static String fold(String in) {
|
||||
synchronized (out) {
|
||||
out.setLength(0);
|
||||
for (int i = 0; i < in.length(); ++i) {
|
||||
char c = in.charAt(i);
|
||||
String f = CF[c];
|
||||
if (f == null) out.append(c);
|
||||
else out.append(f);
|
||||
}
|
||||
return out.toString();
|
||||
}
|
||||
}
|
||||
|
||||
static String[] CF = new String[65536];
|
||||
static {
|
||||
CF[0x0041]="\u0061";
|
||||
CF[0x0042]="\u0062";
|
||||
CF[0x0043]="\u0063";
|
||||
CF[0x0044]="\u0064";
|
||||
CF[0x0045]="\u0065";
|
||||
CF[0x0046]="\u0066";
|
||||
CF[0x0047]="\u0067";
|
||||
CF[0x0048]="\u0068";
|
||||
CF[0x0049]="\u0069";
|
||||
CF[0x004A]="\u006A";
|
||||
CF[0x004B]="\u006B";
|
||||
CF[0x004C]="\u006C";
|
||||
CF[0x004D]="\u006D";
|
||||
CF[0x004E]="\u006E";
|
||||
CF[0x004F]="\u006F";
|
||||
CF[0x0050]="\u0070";
|
||||
CF[0x0051]="\u0071";
|
||||
CF[0x0052]="\u0072";
|
||||
CF[0x0053]="\u0073";
|
||||
CF[0x0054]="\u0074";
|
||||
CF[0x0055]="\u0075";
|
||||
CF[0x0056]="\u0076";
|
||||
CF[0x0057]="\u0077";
|
||||
CF[0x0058]="\u0078";
|
||||
CF[0x0059]="\u0079";
|
||||
CF[0x005A]="\u007A";
|
||||
CF[0x00B5]="\u03BC";
|
||||
CF[0x00C0]="\u00E0";
|
||||
CF[0x00C1]="\u00E1";
|
||||
CF[0x00C2]="\u00E2";
|
||||
CF[0x00C3]="\u00E3";
|
||||
CF[0x00C4]="\u00E4";
|
||||
CF[0x00C5]="\u00E5";
|
||||
CF[0x00C6]="\u00E6";
|
||||
CF[0x00C7]="\u00E7";
|
||||
CF[0x00C8]="\u00E8";
|
||||
CF[0x00C9]="\u00E9";
|
||||
CF[0x00CA]="\u00EA";
|
||||
CF[0x00CB]="\u00EB";
|
||||
CF[0x00CC]="\u00EC";
|
||||
CF[0x00CD]="\u00ED";
|
||||
CF[0x00CE]="\u00EE";
|
||||
CF[0x00CF]="\u00EF";
|
||||
CF[0x00D0]="\u00F0";
|
||||
CF[0x00D1]="\u00F1";
|
||||
CF[0x00D2]="\u00F2";
|
||||
CF[0x00D3]="\u00F3";
|
||||
CF[0x00D4]="\u00F4";
|
||||
CF[0x00D5]="\u00F5";
|
||||
CF[0x00D6]="\u00F6";
|
||||
CF[0x00D8]="\u00F8";
|
||||
CF[0x00D9]="\u00F9";
|
||||
CF[0x00DA]="\u00FA";
|
||||
CF[0x00DB]="\u00FB";
|
||||
CF[0x00DC]="\u00FC";
|
||||
CF[0x00DD]="\u00FD";
|
||||
CF[0x00DE]="\u00FE";
|
||||
CF[0x00DF]="\u0073\u0073";
|
||||
CF[0x0100]="\u0101";
|
||||
CF[0x0102]="\u0103";
|
||||
CF[0x0104]="\u0105";
|
||||
CF[0x0106]="\u0107";
|
||||
CF[0x0108]="\u0109";
|
||||
CF[0x010A]="\u010B";
|
||||
CF[0x010C]="\u010D";
|
||||
CF[0x010E]="\u010F";
|
||||
CF[0x0110]="\u0111";
|
||||
CF[0x0112]="\u0113";
|
||||
CF[0x0114]="\u0115";
|
||||
CF[0x0116]="\u0117";
|
||||
CF[0x0118]="\u0119";
|
||||
CF[0x011A]="\u011B";
|
||||
CF[0x011C]="\u011D";
|
||||
CF[0x011E]="\u011F";
|
||||
CF[0x0120]="\u0121";
|
||||
CF[0x0122]="\u0123";
|
||||
CF[0x0124]="\u0125";
|
||||
CF[0x0126]="\u0127";
|
||||
CF[0x0128]="\u0129";
|
||||
CF[0x012A]="\u012B";
|
||||
CF[0x012C]="\u012D";
|
||||
CF[0x012E]="\u012F";
|
||||
CF[0x0130]="\u0069";
|
||||
CF[0x0131]="\u0069";
|
||||
CF[0x0132]="\u0133";
|
||||
CF[0x0134]="\u0135";
|
||||
CF[0x0136]="\u0137";
|
||||
CF[0x0139]="\u013A";
|
||||
CF[0x013B]="\u013C";
|
||||
CF[0x013D]="\u013E";
|
||||
CF[0x013F]="\u0140";
|
||||
CF[0x0141]="\u0142";
|
||||
CF[0x0143]="\u0144";
|
||||
CF[0x0145]="\u0146";
|
||||
CF[0x0147]="\u0148";
|
||||
CF[0x0149]="\u02BC\u006E";
|
||||
CF[0x014A]="\u014B";
|
||||
CF[0x014C]="\u014D";
|
||||
CF[0x014E]="\u014F";
|
||||
CF[0x0150]="\u0151";
|
||||
CF[0x0152]="\u0153";
|
||||
CF[0x0154]="\u0155";
|
||||
CF[0x0156]="\u0157";
|
||||
CF[0x0158]="\u0159";
|
||||
CF[0x015A]="\u015B";
|
||||
CF[0x015C]="\u015D";
|
||||
CF[0x015E]="\u015F";
|
||||
CF[0x0160]="\u0161";
|
||||
CF[0x0162]="\u0163";
|
||||
CF[0x0164]="\u0165";
|
||||
CF[0x0166]="\u0167";
|
||||
CF[0x0168]="\u0169";
|
||||
CF[0x016A]="\u016B";
|
||||
CF[0x016C]="\u016D";
|
||||
CF[0x016E]="\u016F";
|
||||
CF[0x0170]="\u0171";
|
||||
CF[0x0172]="\u0173";
|
||||
CF[0x0174]="\u0175";
|
||||
CF[0x0176]="\u0177";
|
||||
CF[0x0178]="\u00FF";
|
||||
CF[0x0179]="\u017A";
|
||||
CF[0x017B]="\u017C";
|
||||
CF[0x017D]="\u017E";
|
||||
CF[0x017F]="\u0073";
|
||||
CF[0x0181]="\u0253";
|
||||
CF[0x0182]="\u0183";
|
||||
CF[0x0184]="\u0185";
|
||||
CF[0x0186]="\u0254";
|
||||
CF[0x0187]="\u0188";
|
||||
CF[0x0189]="\u0256";
|
||||
CF[0x018A]="\u0257";
|
||||
CF[0x018B]="\u018C";
|
||||
CF[0x018E]="\u01DD";
|
||||
CF[0x018F]="\u0259";
|
||||
CF[0x0190]="\u025B";
|
||||
CF[0x0191]="\u0192";
|
||||
CF[0x0193]="\u0260";
|
||||
CF[0x0194]="\u0263";
|
||||
CF[0x0196]="\u0269";
|
||||
CF[0x0197]="\u0268";
|
||||
CF[0x0198]="\u0199";
|
||||
CF[0x019C]="\u026F";
|
||||
CF[0x019D]="\u0272";
|
||||
CF[0x019F]="\u0275";
|
||||
CF[0x01A0]="\u01A1";
|
||||
CF[0x01A2]="\u01A3";
|
||||
CF[0x01A4]="\u01A5";
|
||||
CF[0x01A6]="\u0280";
|
||||
CF[0x01A7]="\u01A8";
|
||||
CF[0x01A9]="\u0283";
|
||||
CF[0x01AC]="\u01AD";
|
||||
CF[0x01AE]="\u0288";
|
||||
CF[0x01AF]="\u01B0";
|
||||
CF[0x01B1]="\u028A";
|
||||
CF[0x01B2]="\u028B";
|
||||
CF[0x01B3]="\u01B4";
|
||||
CF[0x01B5]="\u01B6";
|
||||
CF[0x01B7]="\u0292";
|
||||
CF[0x01B8]="\u01B9";
|
||||
CF[0x01BC]="\u01BD";
|
||||
CF[0x01C4]="\u01C6";
|
||||
CF[0x01C5]="\u01C6";
|
||||
CF[0x01C7]="\u01C9";
|
||||
CF[0x01C8]="\u01C9";
|
||||
CF[0x01CA]="\u01CC";
|
||||
CF[0x01CB]="\u01CC";
|
||||
CF[0x01CD]="\u01CE";
|
||||
CF[0x01CF]="\u01D0";
|
||||
CF[0x01D1]="\u01D2";
|
||||
CF[0x01D3]="\u01D4";
|
||||
CF[0x01D5]="\u01D6";
|
||||
CF[0x01D7]="\u01D8";
|
||||
CF[0x01D9]="\u01DA";
|
||||
CF[0x01DB]="\u01DC";
|
||||
CF[0x01DE]="\u01DF";
|
||||
CF[0x01E0]="\u01E1";
|
||||
CF[0x01E2]="\u01E3";
|
||||
CF[0x01E4]="\u01E5";
|
||||
CF[0x01E6]="\u01E7";
|
||||
CF[0x01E8]="\u01E9";
|
||||
CF[0x01EA]="\u01EB";
|
||||
CF[0x01EC]="\u01ED";
|
||||
CF[0x01EE]="\u01EF";
|
||||
CF[0x01F0]="\u006A\u030C";
|
||||
CF[0x01F1]="\u01F3";
|
||||
CF[0x01F2]="\u01F3";
|
||||
CF[0x01F4]="\u01F5";
|
||||
CF[0x01F6]="\u0195";
|
||||
CF[0x01F7]="\u01BF";
|
||||
CF[0x01F8]="\u01F9";
|
||||
CF[0x01FA]="\u01FB";
|
||||
CF[0x01FC]="\u01FD";
|
||||
CF[0x01FE]="\u01FF";
|
||||
CF[0x0200]="\u0201";
|
||||
CF[0x0202]="\u0203";
|
||||
CF[0x0204]="\u0205";
|
||||
CF[0x0206]="\u0207";
|
||||
CF[0x0208]="\u0209";
|
||||
CF[0x020A]="\u020B";
|
||||
CF[0x020C]="\u020D";
|
||||
CF[0x020E]="\u020F";
|
||||
CF[0x0210]="\u0211";
|
||||
CF[0x0212]="\u0213";
|
||||
CF[0x0214]="\u0215";
|
||||
CF[0x0216]="\u0217";
|
||||
CF[0x0218]="\u0219";
|
||||
CF[0x021A]="\u021B";
|
||||
CF[0x021C]="\u021D";
|
||||
CF[0x021E]="\u021F";
|
||||
CF[0x0222]="\u0223";
|
||||
CF[0x0224]="\u0225";
|
||||
CF[0x0226]="\u0227";
|
||||
CF[0x0228]="\u0229";
|
||||
CF[0x022A]="\u022B";
|
||||
CF[0x022C]="\u022D";
|
||||
CF[0x022E]="\u022F";
|
||||
CF[0x0230]="\u0231";
|
||||
CF[0x0232]="\u0233";
|
||||
CF[0x0345]="\u03B9";
|
||||
CF[0x0386]="\u03AC";
|
||||
CF[0x0388]="\u03AD";
|
||||
CF[0x0389]="\u03AE";
|
||||
CF[0x038A]="\u03AF";
|
||||
CF[0x038C]="\u03CC";
|
||||
CF[0x038E]="\u03CD";
|
||||
CF[0x038F]="\u03CE";
|
||||
CF[0x0390]="\u03B9\u0308\u0301";
|
||||
CF[0x0391]="\u03B1";
|
||||
CF[0x0392]="\u03B2";
|
||||
CF[0x0393]="\u03B3";
|
||||
CF[0x0394]="\u03B4";
|
||||
CF[0x0395]="\u03B5";
|
||||
CF[0x0396]="\u03B6";
|
||||
CF[0x0397]="\u03B7";
|
||||
CF[0x0398]="\u03B8";
|
||||
CF[0x0399]="\u03B9";
|
||||
CF[0x039A]="\u03BA";
|
||||
CF[0x039B]="\u03BB";
|
||||
CF[0x039C]="\u03BC";
|
||||
CF[0x039D]="\u03BD";
|
||||
CF[0x039E]="\u03BE";
|
||||
CF[0x039F]="\u03BF";
|
||||
CF[0x03A0]="\u03C0";
|
||||
CF[0x03A1]="\u03C1";
|
||||
CF[0x03A3]="\u03C2";
|
||||
CF[0x03A4]="\u03C4";
|
||||
CF[0x03A5]="\u03C5";
|
||||
CF[0x03A6]="\u03C6";
|
||||
CF[0x03A7]="\u03C7";
|
||||
CF[0x03A8]="\u03C8";
|
||||
CF[0x03A9]="\u03C9";
|
||||
CF[0x03AA]="\u03CA";
|
||||
CF[0x03AB]="\u03CB";
|
||||
CF[0x03B0]="\u03C5\u0308\u0301";
|
||||
CF[0x03C3]="\u03C2";
|
||||
CF[0x03D0]="\u03B2";
|
||||
CF[0x03D1]="\u03B8";
|
||||
CF[0x03D5]="\u03C6";
|
||||
CF[0x03D6]="\u03C0";
|
||||
CF[0x03DA]="\u03DB";
|
||||
CF[0x03DC]="\u03DD";
|
||||
CF[0x03DE]="\u03DF";
|
||||
CF[0x03E0]="\u03E1";
|
||||
CF[0x03E2]="\u03E3";
|
||||
CF[0x03E4]="\u03E5";
|
||||
CF[0x03E6]="\u03E7";
|
||||
CF[0x03E8]="\u03E9";
|
||||
CF[0x03EA]="\u03EB";
|
||||
CF[0x03EC]="\u03ED";
|
||||
CF[0x03EE]="\u03EF";
|
||||
CF[0x03F0]="\u03BA";
|
||||
CF[0x03F1]="\u03C1";
|
||||
CF[0x03F2]="\u03C2";
|
||||
CF[0x0400]="\u0450";
|
||||
CF[0x0401]="\u0451";
|
||||
CF[0x0402]="\u0452";
|
||||
CF[0x0403]="\u0453";
|
||||
CF[0x0404]="\u0454";
|
||||
CF[0x0405]="\u0455";
|
||||
CF[0x0406]="\u0456";
|
||||
CF[0x0407]="\u0457";
|
||||
CF[0x0408]="\u0458";
|
||||
CF[0x0409]="\u0459";
|
||||
CF[0x040A]="\u045A";
|
||||
CF[0x040B]="\u045B";
|
||||
CF[0x040C]="\u045C";
|
||||
CF[0x040D]="\u045D";
|
||||
CF[0x040E]="\u045E";
|
||||
CF[0x040F]="\u045F";
|
||||
CF[0x0410]="\u0430";
|
||||
CF[0x0411]="\u0431";
|
||||
CF[0x0412]="\u0432";
|
||||
CF[0x0413]="\u0433";
|
||||
CF[0x0414]="\u0434";
|
||||
CF[0x0415]="\u0435";
|
||||
CF[0x0416]="\u0436";
|
||||
CF[0x0417]="\u0437";
|
||||
CF[0x0418]="\u0438";
|
||||
CF[0x0419]="\u0439";
|
||||
CF[0x041A]="\u043A";
|
||||
CF[0x041B]="\u043B";
|
||||
CF[0x041C]="\u043C";
|
||||
CF[0x041D]="\u043D";
|
||||
CF[0x041E]="\u043E";
|
||||
CF[0x041F]="\u043F";
|
||||
CF[0x0420]="\u0440";
|
||||
CF[0x0421]="\u0441";
|
||||
CF[0x0422]="\u0442";
|
||||
CF[0x0423]="\u0443";
|
||||
CF[0x0424]="\u0444";
|
||||
CF[0x0425]="\u0445";
|
||||
CF[0x0426]="\u0446";
|
||||
CF[0x0427]="\u0447";
|
||||
CF[0x0428]="\u0448";
|
||||
CF[0x0429]="\u0449";
|
||||
CF[0x042A]="\u044A";
|
||||
CF[0x042B]="\u044B";
|
||||
CF[0x042C]="\u044C";
|
||||
CF[0x042D]="\u044D";
|
||||
CF[0x042E]="\u044E";
|
||||
CF[0x042F]="\u044F";
|
||||
CF[0x0460]="\u0461";
|
||||
CF[0x0462]="\u0463";
|
||||
CF[0x0464]="\u0465";
|
||||
CF[0x0466]="\u0467";
|
||||
CF[0x0468]="\u0469";
|
||||
CF[0x046A]="\u046B";
|
||||
CF[0x046C]="\u046D";
|
||||
CF[0x046E]="\u046F";
|
||||
CF[0x0470]="\u0471";
|
||||
CF[0x0472]="\u0473";
|
||||
CF[0x0474]="\u0475";
|
||||
CF[0x0476]="\u0477";
|
||||
CF[0x0478]="\u0479";
|
||||
CF[0x047A]="\u047B";
|
||||
CF[0x047C]="\u047D";
|
||||
CF[0x047E]="\u047F";
|
||||
CF[0x0480]="\u0481";
|
||||
CF[0x048C]="\u048D";
|
||||
CF[0x048E]="\u048F";
|
||||
CF[0x0490]="\u0491";
|
||||
CF[0x0492]="\u0493";
|
||||
CF[0x0494]="\u0495";
|
||||
CF[0x0496]="\u0497";
|
||||
CF[0x0498]="\u0499";
|
||||
CF[0x049A]="\u049B";
|
||||
CF[0x049C]="\u049D";
|
||||
CF[0x049E]="\u049F";
|
||||
CF[0x04A0]="\u04A1";
|
||||
CF[0x04A2]="\u04A3";
|
||||
CF[0x04A4]="\u04A5";
|
||||
CF[0x04A6]="\u04A7";
|
||||
CF[0x04A8]="\u04A9";
|
||||
CF[0x04AA]="\u04AB";
|
||||
CF[0x04AC]="\u04AD";
|
||||
CF[0x04AE]="\u04AF";
|
||||
CF[0x04B0]="\u04B1";
|
||||
CF[0x04B2]="\u04B3";
|
||||
CF[0x04B4]="\u04B5";
|
||||
CF[0x04B6]="\u04B7";
|
||||
CF[0x04B8]="\u04B9";
|
||||
CF[0x04BA]="\u04BB";
|
||||
CF[0x04BC]="\u04BD";
|
||||
CF[0x04BE]="\u04BF";
|
||||
CF[0x04C1]="\u04C2";
|
||||
CF[0x04C3]="\u04C4";
|
||||
CF[0x04C7]="\u04C8";
|
||||
CF[0x04CB]="\u04CC";
|
||||
CF[0x04D0]="\u04D1";
|
||||
CF[0x04D2]="\u04D3";
|
||||
CF[0x04D4]="\u04D5";
|
||||
CF[0x04D6]="\u04D7";
|
||||
CF[0x04D8]="\u04D9";
|
||||
CF[0x04DA]="\u04DB";
|
||||
CF[0x04DC]="\u04DD";
|
||||
CF[0x04DE]="\u04DF";
|
||||
CF[0x04E0]="\u04E1";
|
||||
CF[0x04E2]="\u04E3";
|
||||
CF[0x04E4]="\u04E5";
|
||||
CF[0x04E6]="\u04E7";
|
||||
CF[0x04E8]="\u04E9";
|
||||
CF[0x04EA]="\u04EB";
|
||||
CF[0x04EC]="\u04ED";
|
||||
CF[0x04EE]="\u04EF";
|
||||
CF[0x04F0]="\u04F1";
|
||||
CF[0x04F2]="\u04F3";
|
||||
CF[0x04F4]="\u04F5";
|
||||
CF[0x04F8]="\u04F9";
|
||||
CF[0x0531]="\u0561";
|
||||
CF[0x0532]="\u0562";
|
||||
CF[0x0533]="\u0563";
|
||||
CF[0x0534]="\u0564";
|
||||
CF[0x0535]="\u0565";
|
||||
CF[0x0536]="\u0566";
|
||||
CF[0x0537]="\u0567";
|
||||
CF[0x0538]="\u0568";
|
||||
CF[0x0539]="\u0569";
|
||||
CF[0x053A]="\u056A";
|
||||
CF[0x053B]="\u056B";
|
||||
CF[0x053C]="\u056C";
|
||||
CF[0x053D]="\u056D";
|
||||
CF[0x053E]="\u056E";
|
||||
CF[0x053F]="\u056F";
|
||||
CF[0x0540]="\u0570";
|
||||
CF[0x0541]="\u0571";
|
||||
CF[0x0542]="\u0572";
|
||||
CF[0x0543]="\u0573";
|
||||
CF[0x0544]="\u0574";
|
||||
CF[0x0545]="\u0575";
|
||||
CF[0x0546]="\u0576";
|
||||
CF[0x0547]="\u0577";
|
||||
CF[0x0548]="\u0578";
|
||||
CF[0x0549]="\u0579";
|
||||
CF[0x054A]="\u057A";
|
||||
CF[0x054B]="\u057B";
|
||||
CF[0x054C]="\u057C";
|
||||
CF[0x054D]="\u057D";
|
||||
CF[0x054E]="\u057E";
|
||||
CF[0x054F]="\u057F";
|
||||
CF[0x0550]="\u0580";
|
||||
CF[0x0551]="\u0581";
|
||||
CF[0x0552]="\u0582";
|
||||
CF[0x0553]="\u0583";
|
||||
CF[0x0554]="\u0584";
|
||||
CF[0x0555]="\u0585";
|
||||
CF[0x0556]="\u0586";
|
||||
CF[0x0587]="\u0565\u0582";
|
||||
CF[0x1E00]="\u1E01";
|
||||
CF[0x1E02]="\u1E03";
|
||||
CF[0x1E04]="\u1E05";
|
||||
CF[0x1E06]="\u1E07";
|
||||
CF[0x1E08]="\u1E09";
|
||||
CF[0x1E0A]="\u1E0B";
|
||||
CF[0x1E0C]="\u1E0D";
|
||||
CF[0x1E0E]="\u1E0F";
|
||||
CF[0x1E10]="\u1E11";
|
||||
CF[0x1E12]="\u1E13";
|
||||
CF[0x1E14]="\u1E15";
|
||||
CF[0x1E16]="\u1E17";
|
||||
CF[0x1E18]="\u1E19";
|
||||
CF[0x1E1A]="\u1E1B";
|
||||
CF[0x1E1C]="\u1E1D";
|
||||
CF[0x1E1E]="\u1E1F";
|
||||
CF[0x1E20]="\u1E21";
|
||||
CF[0x1E22]="\u1E23";
|
||||
CF[0x1E24]="\u1E25";
|
||||
CF[0x1E26]="\u1E27";
|
||||
CF[0x1E28]="\u1E29";
|
||||
CF[0x1E2A]="\u1E2B";
|
||||
CF[0x1E2C]="\u1E2D";
|
||||
CF[0x1E2E]="\u1E2F";
|
||||
CF[0x1E30]="\u1E31";
|
||||
CF[0x1E32]="\u1E33";
|
||||
CF[0x1E34]="\u1E35";
|
||||
CF[0x1E36]="\u1E37";
|
||||
CF[0x1E38]="\u1E39";
|
||||
CF[0x1E3A]="\u1E3B";
|
||||
CF[0x1E3C]="\u1E3D";
|
||||
CF[0x1E3E]="\u1E3F";
|
||||
CF[0x1E40]="\u1E41";
|
||||
CF[0x1E42]="\u1E43";
|
||||
CF[0x1E44]="\u1E45";
|
||||
CF[0x1E46]="\u1E47";
|
||||
CF[0x1E48]="\u1E49";
|
||||
CF[0x1E4A]="\u1E4B";
|
||||
CF[0x1E4C]="\u1E4D";
|
||||
CF[0x1E4E]="\u1E4F";
|
||||
CF[0x1E50]="\u1E51";
|
||||
CF[0x1E52]="\u1E53";
|
||||
CF[0x1E54]="\u1E55";
|
||||
CF[0x1E56]="\u1E57";
|
||||
CF[0x1E58]="\u1E59";
|
||||
CF[0x1E5A]="\u1E5B";
|
||||
CF[0x1E5C]="\u1E5D";
|
||||
CF[0x1E5E]="\u1E5F";
|
||||
CF[0x1E60]="\u1E61";
|
||||
CF[0x1E62]="\u1E63";
|
||||
CF[0x1E64]="\u1E65";
|
||||
CF[0x1E66]="\u1E67";
|
||||
CF[0x1E68]="\u1E69";
|
||||
CF[0x1E6A]="\u1E6B";
|
||||
CF[0x1E6C]="\u1E6D";
|
||||
CF[0x1E6E]="\u1E6F";
|
||||
CF[0x1E70]="\u1E71";
|
||||
CF[0x1E72]="\u1E73";
|
||||
CF[0x1E74]="\u1E75";
|
||||
CF[0x1E76]="\u1E77";
|
||||
CF[0x1E78]="\u1E79";
|
||||
CF[0x1E7A]="\u1E7B";
|
||||
CF[0x1E7C]="\u1E7D";
|
||||
CF[0x1E7E]="\u1E7F";
|
||||
CF[0x1E80]="\u1E81";
|
||||
CF[0x1E82]="\u1E83";
|
||||
CF[0x1E84]="\u1E85";
|
||||
CF[0x1E86]="\u1E87";
|
||||
CF[0x1E88]="\u1E89";
|
||||
CF[0x1E8A]="\u1E8B";
|
||||
CF[0x1E8C]="\u1E8D";
|
||||
CF[0x1E8E]="\u1E8F";
|
||||
CF[0x1E90]="\u1E91";
|
||||
CF[0x1E92]="\u1E93";
|
||||
CF[0x1E94]="\u1E95";
|
||||
CF[0x1E96]="\u0068\u0331";
|
||||
CF[0x1E97]="\u0074\u0308";
|
||||
CF[0x1E98]="\u0077\u030A";
|
||||
CF[0x1E99]="\u0079\u030A";
|
||||
CF[0x1E9A]="\u0061\u02BE";
|
||||
CF[0x1E9B]="\u1E61";
|
||||
CF[0x1EA0]="\u1EA1";
|
||||
CF[0x1EA2]="\u1EA3";
|
||||
CF[0x1EA4]="\u1EA5";
|
||||
CF[0x1EA6]="\u1EA7";
|
||||
CF[0x1EA8]="\u1EA9";
|
||||
CF[0x1EAA]="\u1EAB";
|
||||
CF[0x1EAC]="\u1EAD";
|
||||
CF[0x1EAE]="\u1EAF";
|
||||
CF[0x1EB0]="\u1EB1";
|
||||
CF[0x1EB2]="\u1EB3";
|
||||
CF[0x1EB4]="\u1EB5";
|
||||
CF[0x1EB6]="\u1EB7";
|
||||
CF[0x1EB8]="\u1EB9";
|
||||
CF[0x1EBA]="\u1EBB";
|
||||
CF[0x1EBC]="\u1EBD";
|
||||
CF[0x1EBE]="\u1EBF";
|
||||
CF[0x1EC0]="\u1EC1";
|
||||
CF[0x1EC2]="\u1EC3";
|
||||
CF[0x1EC4]="\u1EC5";
|
||||
CF[0x1EC6]="\u1EC7";
|
||||
CF[0x1EC8]="\u1EC9";
|
||||
CF[0x1ECA]="\u1ECB";
|
||||
CF[0x1ECC]="\u1ECD";
|
||||
CF[0x1ECE]="\u1ECF";
|
||||
CF[0x1ED0]="\u1ED1";
|
||||
CF[0x1ED2]="\u1ED3";
|
||||
CF[0x1ED4]="\u1ED5";
|
||||
CF[0x1ED6]="\u1ED7";
|
||||
CF[0x1ED8]="\u1ED9";
|
||||
CF[0x1EDA]="\u1EDB";
|
||||
CF[0x1EDC]="\u1EDD";
|
||||
CF[0x1EDE]="\u1EDF";
|
||||
CF[0x1EE0]="\u1EE1";
|
||||
CF[0x1EE2]="\u1EE3";
|
||||
CF[0x1EE4]="\u1EE5";
|
||||
CF[0x1EE6]="\u1EE7";
|
||||
CF[0x1EE8]="\u1EE9";
|
||||
CF[0x1EEA]="\u1EEB";
|
||||
CF[0x1EEC]="\u1EED";
|
||||
CF[0x1EEE]="\u1EEF";
|
||||
CF[0x1EF0]="\u1EF1";
|
||||
CF[0x1EF2]="\u1EF3";
|
||||
CF[0x1EF4]="\u1EF5";
|
||||
CF[0x1EF6]="\u1EF7";
|
||||
CF[0x1EF8]="\u1EF9";
|
||||
CF[0x1F08]="\u1F00";
|
||||
CF[0x1F09]="\u1F01";
|
||||
CF[0x1F0A]="\u1F02";
|
||||
CF[0x1F0B]="\u1F03";
|
||||
CF[0x1F0C]="\u1F04";
|
||||
CF[0x1F0D]="\u1F05";
|
||||
CF[0x1F0E]="\u1F06";
|
||||
CF[0x1F0F]="\u1F07";
|
||||
CF[0x1F18]="\u1F10";
|
||||
CF[0x1F19]="\u1F11";
|
||||
CF[0x1F1A]="\u1F12";
|
||||
CF[0x1F1B]="\u1F13";
|
||||
CF[0x1F1C]="\u1F14";
|
||||
CF[0x1F1D]="\u1F15";
|
||||
CF[0x1F28]="\u1F20";
|
||||
CF[0x1F29]="\u1F21";
|
||||
CF[0x1F2A]="\u1F22";
|
||||
CF[0x1F2B]="\u1F23";
|
||||
CF[0x1F2C]="\u1F24";
|
||||
CF[0x1F2D]="\u1F25";
|
||||
CF[0x1F2E]="\u1F26";
|
||||
CF[0x1F2F]="\u1F27";
|
||||
CF[0x1F38]="\u1F30";
|
||||
CF[0x1F39]="\u1F31";
|
||||
CF[0x1F3A]="\u1F32";
|
||||
CF[0x1F3B]="\u1F33";
|
||||
CF[0x1F3C]="\u1F34";
|
||||
CF[0x1F3D]="\u1F35";
|
||||
CF[0x1F3E]="\u1F36";
|
||||
CF[0x1F3F]="\u1F37";
|
||||
CF[0x1F48]="\u1F40";
|
||||
CF[0x1F49]="\u1F41";
|
||||
CF[0x1F4A]="\u1F42";
|
||||
CF[0x1F4B]="\u1F43";
|
||||
CF[0x1F4C]="\u1F44";
|
||||
CF[0x1F4D]="\u1F45";
|
||||
CF[0x1F50]="\u03C5\u0313";
|
||||
CF[0x1F52]="\u03C5\u0313\u0300";
|
||||
CF[0x1F54]="\u03C5\u0313\u0301";
|
||||
CF[0x1F56]="\u03C5\u0313\u0342";
|
||||
CF[0x1F59]="\u1F51";
|
||||
CF[0x1F5B]="\u1F53";
|
||||
CF[0x1F5D]="\u1F55";
|
||||
CF[0x1F5F]="\u1F57";
|
||||
CF[0x1F68]="\u1F60";
|
||||
CF[0x1F69]="\u1F61";
|
||||
CF[0x1F6A]="\u1F62";
|
||||
CF[0x1F6B]="\u1F63";
|
||||
CF[0x1F6C]="\u1F64";
|
||||
CF[0x1F6D]="\u1F65";
|
||||
CF[0x1F6E]="\u1F66";
|
||||
CF[0x1F6F]="\u1F67";
|
||||
CF[0x1F80]="\u1F00\u03B9";
|
||||
CF[0x1F81]="\u1F01\u03B9";
|
||||
CF[0x1F82]="\u1F02\u03B9";
|
||||
CF[0x1F83]="\u1F03\u03B9";
|
||||
CF[0x1F84]="\u1F04\u03B9";
|
||||
CF[0x1F85]="\u1F05\u03B9";
|
||||
CF[0x1F86]="\u1F06\u03B9";
|
||||
CF[0x1F87]="\u1F07\u03B9";
|
||||
CF[0x1F88]="\u1F00\u03B9";
|
||||
CF[0x1F89]="\u1F01\u03B9";
|
||||
CF[0x1F8A]="\u1F02\u03B9";
|
||||
CF[0x1F8B]="\u1F03\u03B9";
|
||||
CF[0x1F8C]="\u1F04\u03B9";
|
||||
CF[0x1F8D]="\u1F05\u03B9";
|
||||
CF[0x1F8E]="\u1F06\u03B9";
|
||||
CF[0x1F8F]="\u1F07\u03B9";
|
||||
CF[0x1F90]="\u1F20\u03B9";
|
||||
CF[0x1F91]="\u1F21\u03B9";
|
||||
CF[0x1F92]="\u1F22\u03B9";
|
||||
CF[0x1F93]="\u1F23\u03B9";
|
||||
CF[0x1F94]="\u1F24\u03B9";
|
||||
CF[0x1F95]="\u1F25\u03B9";
|
||||
CF[0x1F96]="\u1F26\u03B9";
|
||||
CF[0x1F97]="\u1F27\u03B9";
|
||||
CF[0x1F98]="\u1F20\u03B9";
|
||||
CF[0x1F99]="\u1F21\u03B9";
|
||||
CF[0x1F9A]="\u1F22\u03B9";
|
||||
CF[0x1F9B]="\u1F23\u03B9";
|
||||
CF[0x1F9C]="\u1F24\u03B9";
|
||||
CF[0x1F9D]="\u1F25\u03B9";
|
||||
CF[0x1F9E]="\u1F26\u03B9";
|
||||
CF[0x1F9F]="\u1F27\u03B9";
|
||||
CF[0x1FA0]="\u1F60\u03B9";
|
||||
CF[0x1FA1]="\u1F61\u03B9";
|
||||
CF[0x1FA2]="\u1F62\u03B9";
|
||||
CF[0x1FA3]="\u1F63\u03B9";
|
||||
CF[0x1FA4]="\u1F64\u03B9";
|
||||
CF[0x1FA5]="\u1F65\u03B9";
|
||||
CF[0x1FA6]="\u1F66\u03B9";
|
||||
CF[0x1FA7]="\u1F67\u03B9";
|
||||
CF[0x1FA8]="\u1F60\u03B9";
|
||||
CF[0x1FA9]="\u1F61\u03B9";
|
||||
CF[0x1FAA]="\u1F62\u03B9";
|
||||
CF[0x1FAB]="\u1F63\u03B9";
|
||||
CF[0x1FAC]="\u1F64\u03B9";
|
||||
CF[0x1FAD]="\u1F65\u03B9";
|
||||
CF[0x1FAE]="\u1F66\u03B9";
|
||||
CF[0x1FAF]="\u1F67\u03B9";
|
||||
CF[0x1FB2]="\u1F70\u03B9";
|
||||
CF[0x1FB3]="\u03B1\u03B9";
|
||||
CF[0x1FB4]="\u03AC\u03B9";
|
||||
CF[0x1FB6]="\u03B1\u0342";
|
||||
CF[0x1FB7]="\u03B1\u0342\u03B9";
|
||||
CF[0x1FB8]="\u1FB0";
|
||||
CF[0x1FB9]="\u1FB1";
|
||||
CF[0x1FBA]="\u1F70";
|
||||
CF[0x1FBB]="\u1F71";
|
||||
CF[0x1FBC]="\u03B1\u03B9";
|
||||
CF[0x1FBE]="\u03B9";
|
||||
CF[0x1FC2]="\u1F74\u03B9";
|
||||
CF[0x1FC3]="\u03B7\u03B9";
|
||||
CF[0x1FC4]="\u03AE\u03B9";
|
||||
CF[0x1FC6]="\u03B7\u0342";
|
||||
CF[0x1FC7]="\u03B7\u0342\u03B9";
|
||||
CF[0x1FC8]="\u1F72";
|
||||
CF[0x1FC9]="\u1F73";
|
||||
CF[0x1FCA]="\u1F74";
|
||||
CF[0x1FCB]="\u1F75";
|
||||
CF[0x1FCC]="\u03B7\u03B9";
|
||||
CF[0x1FD2]="\u03B9\u0308\u0300";
|
||||
CF[0x1FD3]="\u03B9\u0308\u0301";
|
||||
CF[0x1FD6]="\u03B9\u0342";
|
||||
CF[0x1FD7]="\u03B9\u0308\u0342";
|
||||
CF[0x1FD8]="\u1FD0";
|
||||
CF[0x1FD9]="\u1FD1";
|
||||
CF[0x1FDA]="\u1F76";
|
||||
CF[0x1FDB]="\u1F77";
|
||||
CF[0x1FE2]="\u03C5\u0308\u0300";
|
||||
CF[0x1FE3]="\u03C5\u0308\u0301";
|
||||
CF[0x1FE4]="\u03C1\u0313";
|
||||
CF[0x1FE6]="\u03C5\u0342";
|
||||
CF[0x1FE7]="\u03C5\u0308\u0342";
|
||||
CF[0x1FE8]="\u1FE0";
|
||||
CF[0x1FE9]="\u1FE1";
|
||||
CF[0x1FEA]="\u1F7A";
|
||||
CF[0x1FEB]="\u1F7B";
|
||||
CF[0x1FEC]="\u1FE5";
|
||||
CF[0x1FF2]="\u1F7C\u03B9";
|
||||
CF[0x1FF3]="\u03C9\u03B9";
|
||||
CF[0x1FF4]="\u03CE\u03B9";
|
||||
CF[0x1FF6]="\u03C9\u0342";
|
||||
CF[0x1FF7]="\u03C9\u0342\u03B9";
|
||||
CF[0x1FF8]="\u1F78";
|
||||
CF[0x1FF9]="\u1F79";
|
||||
CF[0x1FFA]="\u1F7C";
|
||||
CF[0x1FFB]="\u1F7D";
|
||||
CF[0x1FFC]="\u03C9\u03B9";
|
||||
CF[0x2126]="\u03C9";
|
||||
CF[0x212A]="\u006B";
|
||||
CF[0x212B]="\u00E5";
|
||||
CF[0x2160]="\u2170";
|
||||
CF[0x2161]="\u2171";
|
||||
CF[0x2162]="\u2172";
|
||||
CF[0x2163]="\u2173";
|
||||
CF[0x2164]="\u2174";
|
||||
CF[0x2165]="\u2175";
|
||||
CF[0x2166]="\u2176";
|
||||
CF[0x2167]="\u2177";
|
||||
CF[0x2168]="\u2178";
|
||||
CF[0x2169]="\u2179";
|
||||
CF[0x216A]="\u217A";
|
||||
CF[0x216B]="\u217B";
|
||||
CF[0x216C]="\u217C";
|
||||
CF[0x216D]="\u217D";
|
||||
CF[0x216E]="\u217E";
|
||||
CF[0x216F]="\u217F";
|
||||
CF[0x24B6]="\u24D0";
|
||||
CF[0x24B7]="\u24D1";
|
||||
CF[0x24B8]="\u24D2";
|
||||
CF[0x24B9]="\u24D3";
|
||||
CF[0x24BA]="\u24D4";
|
||||
CF[0x24BB]="\u24D5";
|
||||
CF[0x24BC]="\u24D6";
|
||||
CF[0x24BD]="\u24D7";
|
||||
CF[0x24BE]="\u24D8";
|
||||
CF[0x24BF]="\u24D9";
|
||||
CF[0x24C0]="\u24DA";
|
||||
CF[0x24C1]="\u24DB";
|
||||
CF[0x24C2]="\u24DC";
|
||||
CF[0x24C3]="\u24DD";
|
||||
CF[0x24C4]="\u24DE";
|
||||
CF[0x24C5]="\u24DF";
|
||||
CF[0x24C6]="\u24E0";
|
||||
CF[0x24C7]="\u24E1";
|
||||
CF[0x24C8]="\u24E2";
|
||||
CF[0x24C9]="\u24E3";
|
||||
CF[0x24CA]="\u24E4";
|
||||
CF[0x24CB]="\u24E5";
|
||||
CF[0x24CC]="\u24E6";
|
||||
CF[0x24CD]="\u24E7";
|
||||
CF[0x24CE]="\u24E8";
|
||||
CF[0x24CF]="\u24E9";
|
||||
CF[0xFB00]="\u0066\u0066";
|
||||
CF[0xFB01]="\u0066\u0069";
|
||||
CF[0xFB02]="\u0066\u006C";
|
||||
CF[0xFB03]="\u0066\u0066\u0069";
|
||||
CF[0xFB04]="\u0066\u0066\u006C";
|
||||
CF[0xFB05]="\u0073\u0074";
|
||||
CF[0xFB06]="\u0073\u0074";
|
||||
CF[0xFB13]="\u0574\u0576";
|
||||
CF[0xFB14]="\u0574\u0565";
|
||||
CF[0xFB15]="\u0574\u056B";
|
||||
CF[0xFB16]="\u057E\u0576";
|
||||
CF[0xFB17]="\u0574\u056D";
|
||||
CF[0xFF21]="\uFF41";
|
||||
CF[0xFF22]="\uFF42";
|
||||
CF[0xFF23]="\uFF43";
|
||||
CF[0xFF24]="\uFF44";
|
||||
CF[0xFF25]="\uFF45";
|
||||
CF[0xFF26]="\uFF46";
|
||||
CF[0xFF27]="\uFF47";
|
||||
CF[0xFF28]="\uFF48";
|
||||
CF[0xFF29]="\uFF49";
|
||||
CF[0xFF2A]="\uFF4A";
|
||||
CF[0xFF2B]="\uFF4B";
|
||||
CF[0xFF2C]="\uFF4C";
|
||||
CF[0xFF2D]="\uFF4D";
|
||||
CF[0xFF2E]="\uFF4E";
|
||||
CF[0xFF2F]="\uFF4F";
|
||||
CF[0xFF30]="\uFF50";
|
||||
CF[0xFF31]="\uFF51";
|
||||
CF[0xFF32]="\uFF52";
|
||||
CF[0xFF33]="\uFF53";
|
||||
CF[0xFF34]="\uFF54";
|
||||
CF[0xFF35]="\uFF55";
|
||||
CF[0xFF36]="\uFF56";
|
||||
CF[0xFF37]="\uFF57";
|
||||
CF[0xFF38]="\uFF58";
|
||||
CF[0xFF39]="\uFF59";
|
||||
CF[0xFF3A]="\uFF5A";
|
||||
// 785 case foldings total
|
||||
}
|
||||
}
|
490
tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
Normal file
490
tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
Normal file
|
@ -0,0 +1,490 @@
|
|||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UTF16;
|
||||
|
||||
public class GenOverlap {
|
||||
|
||||
static Map completes = new TreeMap();
|
||||
static Map back = new HashMap();
|
||||
static Map initials = new HashMap();
|
||||
static int[] ces = new int[50];
|
||||
static UCA collator;
|
||||
static UCD ucd;
|
||||
static Normalizer nfd;
|
||||
static Normalizer nfkd;
|
||||
|
||||
public static void test(UCA collatorIn) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
CEList.main(null);
|
||||
|
||||
System.out.println("# Overlap");
|
||||
System.out.println("# Generated " + new Date());
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
|
||||
|
||||
// store data for faster lookup
|
||||
|
||||
System.out.println("# Gathering Data");
|
||||
int counter = 0;
|
||||
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
while (true) {
|
||||
|
||||
Utility.dot(counter++);
|
||||
String s = cc.next(ces, lenArray);
|
||||
if (s == null) break;
|
||||
int len = lenArray[0];
|
||||
|
||||
CEList currCEList = new CEList(ces, 0, len);
|
||||
addString(s, currCEList);
|
||||
}
|
||||
|
||||
for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) {
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
byte decompType = ucd.getDecompositionType(cp);
|
||||
if (decompType >= UCD.COMPATIBILITY) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
CEList celist = getCEList(cp, decomp, decompType);
|
||||
addString(decomp, celist);
|
||||
System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
|
||||
}
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("# Completes Count: " + completes.size());
|
||||
System.out.println("# Initials Count: " + initials.size());
|
||||
System.out.println("# Writing Overlaps");
|
||||
|
||||
// simpleList();
|
||||
fullCheck();
|
||||
}
|
||||
|
||||
public static void addString(String s, CEList currCEList) {
|
||||
back.put(s, currCEList);
|
||||
completes.put(currCEList, s);
|
||||
|
||||
for (int i = 1; i < currCEList.length(); ++i) {
|
||||
CEList start = currCEList.start(i);
|
||||
Set bag = (Set) initials.get(start);
|
||||
if (bag == null) {
|
||||
bag = new TreeSet();
|
||||
initials.put(start, bag);
|
||||
}
|
||||
bag.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void simpleList() {
|
||||
Iterator it = completes.keySet().iterator();
|
||||
int counter = 0;
|
||||
int foundCount = 0;
|
||||
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
|
||||
// see if the ces for the current element are the start of something else
|
||||
CEList key = (CEList) it.next();
|
||||
String val = (String) completes.get(key);
|
||||
Set probe = (Set) initials.get(key);
|
||||
|
||||
if (probe != null) {
|
||||
Utility.fixDot();
|
||||
foundCount++;
|
||||
System.out.println("Possible Overlap: ");
|
||||
System.out.println(" " + ucd.getCodeAndName(val));
|
||||
System.out.println("\t" + key);
|
||||
|
||||
Iterator it2 = probe.iterator();
|
||||
int count2 = 0;
|
||||
while (it2.hasNext()) {
|
||||
String match = (String) it2.next();
|
||||
CEList ceList = (CEList) back.get(match);
|
||||
System.out.println((count2++) + ". " + ucd.getCodeAndName(match));
|
||||
System.out.println("\t" + ceList);
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("# Found Count: " + foundCount);
|
||||
}
|
||||
|
||||
// When true, matchWhole prints a trace of each step of its search to System.out.
static boolean PROGRESS = false;
|
||||
|
||||
static void fullCheck() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("Overlap.html");
|
||||
PrintWriter simpleList = Utility.openPrintWriter("Overlap.txt");
|
||||
|
||||
Iterator it = completes.keySet().iterator();
|
||||
int counter = 0;
|
||||
int foundCount = 0;
|
||||
|
||||
String [] goalChars = new String[1];
|
||||
String [] matchChars = new String[1];
|
||||
|
||||
// CEList show = getCEList("\u2034");
|
||||
log.println("<html><head>");
|
||||
log.println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
log.println("<title>New Page 1</title>");
|
||||
log.println("<style><!--");
|
||||
log.println("table { border-style: solid; border-width: 1 }");
|
||||
log.println("td { border-style: solid; border-width: 1 }");
|
||||
log.println("--></style>");
|
||||
log.println("</head><body><table>");
|
||||
|
||||
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
CEList key = (CEList) it.next();
|
||||
if (key.length() < 2) continue;
|
||||
|
||||
String val = (String) completes.get(key);
|
||||
goalChars[0] = "";
|
||||
matchChars[0] = "";
|
||||
if (matchWhole(val, key, 0, goalChars, matchChars)) {
|
||||
|
||||
simpleList.println(ucd.getCodeAndName(val));
|
||||
|
||||
goalChars[0] = val + goalChars[0]; // fix first char
|
||||
|
||||
if (!getCEList(goalChars[0]).equals(getCEList(matchChars[0]))) {
|
||||
log.println("<tr><td colspan='6'>WARNING:" + getCEList(matchChars[0]) + "</td></tr>");
|
||||
}
|
||||
foundCount++;
|
||||
log.println("<tr><td>" + val + "</td>");
|
||||
log.println("<td>" + goalChars[0] + "</td>");
|
||||
log.println("<td>" + matchChars[0] + "</td>");
|
||||
log.println("<td>" + ucd.getCodeAndName(goalChars[0]) + "</td>");
|
||||
log.println("<td>" + ucd.getCodeAndName(matchChars[0]) + "</td>");
|
||||
log.println("<td>" + getCEList(goalChars[0]) + "</td></tr>");
|
||||
//log.println("\t" + );
|
||||
}
|
||||
}
|
||||
log.println("</tr></table>Number of Overlapping characters: " + foundCount + "</body>");
|
||||
log.close();
|
||||
simpleList.close();
|
||||
}
|
||||
|
||||
static private CEList getCEList(String s) {
|
||||
int len = collator.getCEs(s, true, ces);
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
||||
static private CEList getCEList(int originalChar, String s, byte type) {
|
||||
int len = collator.getCEs(s, true, ces);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]),
|
||||
UCA.getSecondary(ces[i]),
|
||||
CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
|
||||
}
|
||||
return new CEList(ces, 0, len);
|
||||
}
|
||||
|
||||
static boolean matchWhole(String goalStr, CEList goal, int depth, String[] goalChars, String[] otherChars) {
|
||||
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Trying: " + ucd.getCodeAndName(goalStr) + ", " + goal);
|
||||
|
||||
// to stop infinite loops, we limit the depth to 5
|
||||
if (depth > 5) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "stack exhausted");
|
||||
return false;
|
||||
}
|
||||
|
||||
String match;
|
||||
|
||||
// There are 3 possible conditions. Any of which work.
|
||||
|
||||
// To eliminate double matches at the top level, we test depth > 0
|
||||
|
||||
if (depth > 0) {
|
||||
|
||||
// Condition 1.
|
||||
// we have an exact match
|
||||
|
||||
match = (String) completes.get(goal);
|
||||
if (match != null) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Exactly: " + ucd.getCodeAndName(match));
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Condition 2
|
||||
// this whole string matches some initial portion of another string
|
||||
// AND the remainder of that other string also does a matchWhole.
|
||||
// Example: if we get the following, we search for a match to "de"
|
||||
// abc...
|
||||
// abcde
|
||||
// If we find a match, we append to the strings, the string for abc
|
||||
// and the one for abcde
|
||||
|
||||
Set probe = (Set) initials.get(goal);
|
||||
if (probe != null) {
|
||||
Iterator it2 = probe.iterator();
|
||||
while (it2.hasNext()) {
|
||||
match = (String) it2.next();
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Longer: " + ucd.getCodeAndName(match)
|
||||
+ "\t\tswitching");
|
||||
CEList trail = ((CEList) back.get(match)).end(goal.length());
|
||||
boolean doesMatch = matchWhole(match, trail, depth+1, otherChars, goalChars);
|
||||
if (doesMatch) {
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Condition 3
|
||||
// the first part of this string matches a whole other string
|
||||
// and the remainder of this string also does a matchWhole
|
||||
// Example: if we get the following, we search for a match to "de"
|
||||
// abcde..
|
||||
// abc..
|
||||
// if we find a match
|
||||
|
||||
for (int i = goal.length() - 1; i > 0; --i) {
|
||||
CEList first = goal.start(i);
|
||||
match = (String) completes.get(first);
|
||||
if (match != null) {
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Shorter: " + ucd.getCodeAndName(match));
|
||||
boolean doesMatch = matchWhole("", goal.end(i), depth+1, goalChars, otherChars);
|
||||
if (doesMatch) {
|
||||
otherChars[0] = match + otherChars[0];
|
||||
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
|
||||
+ ucd.getCode(goalChars[0])
|
||||
+ " / " + ucd.getCode(otherChars[0])
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we get this far, we failed.
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public static void generateRevision (UCA collatorIn) throws Exception {
|
||||
generateRevision(collatorIn, false);
|
||||
generateRevision(collatorIn, true);
|
||||
}
|
||||
|
||||
public static void generateRevision (UCA collatorIn, boolean doMax) throws Exception {
|
||||
collator = collatorIn;
|
||||
|
||||
CEList.main(null);
|
||||
|
||||
System.out.println("# Generate");
|
||||
System.out.println("# Generated " + new Date());
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
|
||||
|
||||
// store data for faster lookup
|
||||
|
||||
System.out.println("# Gathering Data");
|
||||
int counter = 0;
|
||||
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
Set list = new TreeSet();
|
||||
Map newCollisions = new HashMap();
|
||||
Map oldCollisions = new HashMap();
|
||||
Map newProblems = new TreeMap();
|
||||
Map oldProblems = new TreeMap();
|
||||
|
||||
CEList nullCEList = new CEList(new int[1]);
|
||||
|
||||
while (true) {
|
||||
Utility.dot(counter++);
|
||||
String str = cc.next(ces, lenArray);
|
||||
if (str == null) break;
|
||||
int len = lenArray[0];
|
||||
|
||||
CEList oldList = new CEList(ces, 0, len);
|
||||
|
||||
CEList newList = new CEList(ces,0,0);
|
||||
int cp;
|
||||
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(str, i);
|
||||
if (0xFF67 <= cp && cp <= 0xFF6F) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean mashLast = false;
|
||||
if (nfkd.normalizationDiffers(cp)) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
String canon = nfd.normalize(cp);
|
||||
len = collator.getCEs(decomp, true, ces);
|
||||
if (!decomp.equals(canon)) {
|
||||
byte type = ucd.getDecompositionType(cp);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
int p = (i == 0 && decomp.length() > 1 && decomp.charAt(0) == ' ' ? 0x20A : UCA.getPrimary(ces[j]));
|
||||
int s = UCA.getSecondary(ces[j]);
|
||||
boolean needsFix = (s != 0x20 && p != 0);
|
||||
if (needsFix) ++len;
|
||||
int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
|
||||
if (needsFix) {
|
||||
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
|
||||
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
|
||||
p = 0;
|
||||
}
|
||||
ces[j] = UCA.makeKey(p, s, t);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
len = collator.getCEs(UTF16.valueOf(cp), true, ces);
|
||||
}
|
||||
CEList inc = new CEList(ces, 0, len);
|
||||
|
||||
if (cp == 0xFF71 || cp == 0xFF67) {
|
||||
System.out.println(" String: " + ucd.getCodeAndName(cp));
|
||||
System.out.println(" Type: " + ucd.getDecompositionTypeID(cp));
|
||||
System.out.println(" xxx: " + inc);
|
||||
}
|
||||
|
||||
newList = newList.append(inc);
|
||||
|
||||
}
|
||||
if (newList.length() == 0) newList = nullCEList;
|
||||
if (oldList.length() == 0) oldList = nullCEList;
|
||||
|
||||
if (!newList.equals(oldList)) {
|
||||
/*
|
||||
System.out.println("String: " + ucd.getCodeAndName(str));
|
||||
System.out.println("\tOld: " + oldList);
|
||||
System.out.println("\tNew: " + newList);
|
||||
*/
|
||||
list.add(new Pair(newList, new Pair(str, oldList)));
|
||||
}
|
||||
|
||||
// check for collisions
|
||||
if (str.equals("\u206F")) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
Object probe = newCollisions.get(newList);
|
||||
if (probe == null) {
|
||||
newCollisions.put(newList, str);
|
||||
} else {
|
||||
newProblems.put(str, new Pair((String)probe, newList));
|
||||
}
|
||||
|
||||
probe = oldCollisions.get(oldList);
|
||||
if (probe == null) {
|
||||
oldCollisions.put(oldList, str);
|
||||
} else {
|
||||
oldProblems.put(str, new Pair((String)probe, oldList));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Set newKeys = new TreeSet(newProblems.keySet());
|
||||
Set oldKeys = new TreeSet(oldProblems.keySet());
|
||||
Set joint = new TreeSet(newKeys);
|
||||
joint.retainAll(oldKeys);
|
||||
newKeys.removeAll(joint);
|
||||
oldKeys.removeAll(joint);
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"));
|
||||
Iterator it = list.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter++);
|
||||
Pair value = (Pair) it.next();
|
||||
CEList newList = (CEList)value.first;
|
||||
int cur = UCA.getPrimary(newList.at(0));
|
||||
if (cur != last) {
|
||||
log.println();
|
||||
last = cur;
|
||||
}
|
||||
Pair v2 = (Pair) value.second;
|
||||
String ss = (String)v2.first;
|
||||
log.println(ucd.getCodeAndName(ss) + "\t\t" + ucd.getDecompositionTypeID(ss.charAt(0)));
|
||||
log.println("\tnew:\t" + value.first);
|
||||
log.println("\told:\t" + v2.second);
|
||||
}
|
||||
|
||||
/*
|
||||
log.println();
|
||||
log.println("New Collisions: " + newKeys.size());
|
||||
it = newKeys.iterator();
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
CEList cel = (CEList) newProblems.get(key);
|
||||
String other = (String) newCollisions.get(cel);
|
||||
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
|
||||
log.println("\t" + cel);
|
||||
}
|
||||
|
||||
log.println("Removed Collisions: " + oldKeys.size());
|
||||
it = oldKeys.iterator();
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
CEList cel = (CEList) oldProblems.get(key);
|
||||
String other = (String) oldCollisions.get(cel);
|
||||
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
|
||||
log.println("\t" + cel);
|
||||
}
|
||||
*/
|
||||
|
||||
showCollisions(log, "New Collisions:", newKeys, newProblems);
|
||||
showCollisions(log, "Old Collisions:", oldKeys, oldProblems);
|
||||
showCollisions(log, "In Both:", joint, oldProblems);
|
||||
log.close();
|
||||
}
|
||||
|
||||
static void showCollisions(PrintWriter log, String title, Set bad, Map probs) {
|
||||
log.println();
|
||||
log.println(title + bad.size());
|
||||
Iterator it = bad.iterator();
|
||||
Set lister = new TreeSet();
|
||||
|
||||
while (it.hasNext()) {
|
||||
String key = (String) it.next();
|
||||
Pair pair = (Pair) probs.get(key);
|
||||
String other = (String) pair.first;
|
||||
CEList cel = (CEList) pair.second;
|
||||
if (key.equals("\u0001")) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
lister.add(new Pair(cel, ucd.getCodeAndName(key) + ",\t" + ucd.getCodeAndName(other)));
|
||||
}
|
||||
|
||||
it = lister.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
Pair pair = (Pair) it.next();
|
||||
CEList cel = (CEList) pair.first;
|
||||
int curr = UCA.getPrimary(cel.at(0));
|
||||
if (curr != last) {
|
||||
last = curr;
|
||||
log.println();
|
||||
}
|
||||
log.println("Collision between: " + pair.second);
|
||||
log.println("\t" + pair.first);
|
||||
}
|
||||
log.flush();
|
||||
}
|
||||
}
|
54
tools/unicodetools/com/ibm/text/UCA/RuleComparator.java
Normal file
54
tools/unicodetools/com/ibm/text/UCA/RuleComparator.java
Normal file
|
@ -0,0 +1,54 @@
|
|||
package com.ibm.text.UCA;
|
||||
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
/**
 * Orders strings by comparing them char by char as unsigned 16-bit values;
 * when one string is a prefix of the other, the shorter one sorts first.
 *
 * An earlier level-aware variant (treating U+0000 as a level separator) was
 * disabled in the original code; only the plain comparison is implemented.
 */
public final class RuleComparator implements java.util.Comparator {

    /**
     * Compares two strings.
     *
     * @param s first operand; must be a String
     * @param t second operand; must be a String
     * @return -1, 0 or 1 as s sorts before, equal to or after t
     * @throws ClassCastException if either argument is not a String
     */
    public int compare(Object s, Object t) {
        String ss = (String)s;
        String tt = (String)t;

        // Both indices stay below the shorter length, so charAt cannot go
        // out of bounds and no exception handling is needed here.
        int shared = Math.min(ss.length(), tt.length());
        for (int i = 0; i < shared; ++i) {
            char cs = ss.charAt(i);
            char ct = tt.charAt(i);
            if (cs != ct) {
                return cs < ct ? -1 : 1;
            }
        }

        // Common prefix is identical: the shorter string sorts first.
        if (ss.length() > tt.length()) return 1;
        if (ss.length() < tt.length()) return -1;
        return 0;
    }
}
|
1579
tools/unicodetools/com/ibm/text/UCA/UCA.java
Normal file
1579
tools/unicodetools/com/ibm/text/UCA/UCA.java
Normal file
File diff suppressed because it is too large
Load diff
3056
tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
Normal file
3056
tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
Normal file
File diff suppressed because it is too large
Load diff
2112
tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java
Normal file
2112
tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java
Normal file
File diff suppressed because it is too large
Load diff
538
tools/unicodetools/com/ibm/text/UCD/BuildNames.java
Normal file
538
tools/unicodetools/com/ibm/text/UCD/BuildNames.java
Normal file
|
@ -0,0 +1,538 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
//import com.ibm.text.unicode.UInfo;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
//import java.text.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public class BuildNames implements UCD_Types {
|
||||
|
||||
// Enables the diagnostic println calls in lookup, putString and addString.
static final boolean DEBUG = true;
|
||||
|
||||
// Character database; assigned in main from UCD.make().
static UCD ucd;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
collectWords();
|
||||
}
|
||||
|
||||
// Distinct words passed to stash(), ordered by a LengthFirstComparator.
static Set words = new TreeSet(new LengthFirstComparator());
|
||||
// Distinct transformed names added by collectWords(), ordered by a LengthFirstComparator.
static Set lines = new TreeSet(new LengthFirstComparator());
|
||||
// Occurrence count per character, indexed by char value; stash() increments it, so only chars below 128 fit.
static int[] letters = new int[128];
|
||||
|
||||
static void stash(String word) {
|
||||
words.add(word);
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
letters[word.charAt(i)]++;
|
||||
}
|
||||
}
|
||||
|
||||
static String transform(String line) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean changed = false;
|
||||
for (int i = 0; i < line.length(); ++i) {
|
||||
char c = line.charAt(i);
|
||||
|
||||
if (c == '-' || c == '<' || c == '>') {
|
||||
if (result.length() > 0 && result.charAt(result.length()-1) != ' ') result.append(' ');
|
||||
result.append(c);
|
||||
if (i + 1 < line.length() && line.charAt(i+1) != ' ') result.append(' ');
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ('a' <= c && c <= 'z') {
|
||||
result.append((char)(c - 'a' + 'A'));
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
if ('0' <= c && c <= '9') {
|
||||
result.append('*').append((char)(c - '0' + 'A'));
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
result.append(c);
|
||||
}
|
||||
if (!changed) return line;
|
||||
return result.toString().trim();
|
||||
}
|
||||
|
||||
static void collectWords() throws IOException {
|
||||
|
||||
System.out.println("Gathering data");
|
||||
//Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
//int total = 0;
|
||||
int used = 0;
|
||||
int sum = 0;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (ucd.hasComputableName(i)) continue;
|
||||
String name = transform(ucd.getName(i));
|
||||
|
||||
|
||||
sum += name.length();
|
||||
used++;
|
||||
|
||||
// replace numbers & letters
|
||||
|
||||
int len = Utility.split(name, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
stash(parts[j]);
|
||||
}
|
||||
|
||||
lines.add(name);
|
||||
}
|
||||
System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
|
||||
System.out.println("Strings: " + sum + ", " + (lastLink*4));
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Words");
|
||||
System.out.println();
|
||||
Iterator it = words.iterator();
|
||||
int i = 0;
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
int test = CompactName.addWord(s);
|
||||
String round = CompactName.stringFromToken(test);
|
||||
boolean goesRound = round.equals(s);
|
||||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Lines");
|
||||
System.out.println();
|
||||
CompactName.startLines();
|
||||
it = lines.iterator();
|
||||
i = 0;
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
if (s.equals("< BELL >")) {
|
||||
System.out.println("DEBUG");
|
||||
}
|
||||
int test = CompactName.addLine(s);
|
||||
String round = CompactName.stringFromToken(test);
|
||||
boolean goesRound = round.equals(s);
|
||||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (!goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
/*System.out.println("Printing Compact Forms");
|
||||
for (int i = 0; i < CompactName.lastToken; ++i) {
|
||||
String s = CompactName.stringFromToken(i);
|
||||
System.out.println(i + ": '" + s + "'");
|
||||
}*/
|
||||
|
||||
System.out.println("Strings: " + sum
|
||||
+ ", " + (CompactName.spacedMinimum*4)
|
||||
+ ", " + (CompactName.lastToken*4)
|
||||
);
|
||||
|
||||
}
|
||||
/*
|
||||
Set stuff = new TreeSet();
|
||||
for (int i = 0; i < letters.length; ++i) {
|
||||
if (letters[i] != 0) {
|
||||
stuff.add(new Integer((letters[i] << 8) + i));
|
||||
}
|
||||
}
|
||||
|
||||
it = stuff.iterator();
|
||||
while (it.hasNext()) {
|
||||
int in = ((Integer) it.next()).intValue();
|
||||
System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8));
|
||||
}
|
||||
int r = addString(name);
|
||||
if (!DEBUG && !rname.equals(name)) {
|
||||
System.out.println("\tNo Round Trip: '" + rname + "'");
|
||||
}
|
||||
*/
|
||||
|
||||
// Strings of three or more chars that have a link entry, mapped to their token (an Integer).
static Map stringToInt = new HashMap();
static Map intToString = new HashMap();

// Character code -> small non-zero symbol number; 0 means "no symbol assigned".
static final int[] remap = new int['Z'+1];
// One more than the highest symbol number handed out below.
static final int maxToken;

static {
    int next = 1;
    // Numbering order: space, '-', '>', '<', then A-Z, then 0-9.
    final String punctuation = " -><";
    for (int k = 0; k < punctuation.length(); ++k) {
        remap[punctuation.charAt(k)] = next++;
    }
    for (char letter = 'A'; letter <= 'Z'; ++letter) {
        remap[letter] = next++;
    }
    for (char digit = '0'; digit <= '9'; ++digit) {
        remap[digit] = next++;
    }
    maxToken = next;
}

// Inverse of remap: symbol number -> one-character string; entry 0 is the empty string.
static final String[] unmap = new String[maxToken];
static {
    unmap[0] = "";
    for (char c = 0; c < remap.length; ++c) {
        final int symbol = remap[c];
        if (symbol != 0) {
            unmap[symbol] = String.valueOf(c);
        }
    }
}

// Link table: each entry packs a lead token in the high 16 bits and a trail token in the low 16 bits.
static int[] links = new int[40000];
static final int linkStart = 0;
// Index of the next free slot in links.
static int lastLink = 0;
// Tokens above this value encode one or two symbols directly rather than indexing links.
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
|
||||
|
||||
static boolean isLiteral(int i) {
|
||||
return (i & 0x7FFF) > LITERAL_BOUND;
|
||||
}
|
||||
|
||||
static String lookup(int i) {
|
||||
String result;
|
||||
boolean trailingSpace = false;
|
||||
if ((i & 0x8000) != 0) {
|
||||
i ^= 0x8000;
|
||||
trailingSpace = true;
|
||||
}
|
||||
if (i > LITERAL_BOUND) {
|
||||
i = i - LITERAL_BOUND;
|
||||
int first = i / maxToken;
|
||||
int second = i % maxToken;
|
||||
result = unmap[first] + unmap[second];
|
||||
} else {
|
||||
int value = links[i];
|
||||
int lead = value >>> 16;
|
||||
int trail = value & 0xFFFF;
|
||||
//if (DEBUG) System.out.println("lead: " + lead + ", trail: " + trail);
|
||||
result = lookup(lead) + lookup(trail);
|
||||
}
|
||||
if (trailingSpace) result += ' ';
|
||||
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
|
||||
static int getInt(String s) {
|
||||
if (s.length() < 3) {
|
||||
if (s.length() == 0) return 0;
|
||||
int first = s.charAt(0);
|
||||
int second = s.length() > 1 ? s.charAt(1) : 0;
|
||||
return LITERAL_BOUND + (remap[first] * maxToken + remap[second]);
|
||||
}
|
||||
Object in = stringToInt.get(s);
|
||||
if (in == null) return -1;
|
||||
return ((Integer)in).intValue();
|
||||
}
|
||||
|
||||
static int putString(String s, int lead, int trail) {
|
||||
Object in = stringToInt.get(s);
|
||||
if (in != null) throw new IllegalArgumentException();
|
||||
int value = (lead << 16) + (trail & 0xFFFF);
|
||||
int result = lastLink;
|
||||
links[lastLink++] = value;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail);
|
||||
String roundTrip = lookup(result);
|
||||
if (!roundTrip.equals(s)) {
|
||||
System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
|
||||
}
|
||||
}
|
||||
stringToInt.put(s, new Integer(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
|
||||
// Returns the token for s, creating link entries through putString as needed.
// An existing token (getInt(s) != -1) is returned as is. Otherwise s is split
// into a first and a last part: a split where both parts already have tokens
// is used immediately; failing that, a split where one part has a non-literal
// token is used and the other part is added recursively; failing that, s is
// split at the last recorded space, or near the middle, and both parts are
// added recursively.
static int addString(String s) {
|
||||
int result = getInt(s);
|
||||
if (result != -1) return result;
|
||||
// split positions examined below are 1 .. s.length()-2
int limit = s.length() - 1;
|
||||
// best candidate seen so far among all split positions
int bestLen = 0;
|
||||
int best_i = 0;
|
||||
// best candidate seen so far among split positions that follow a space
int bestSpaceLen = 0;
|
||||
int bestSpace_i = 0;
|
||||
// last position i whose preceding char is a space; -1 if none was seen
int lastSpace = -1;
|
||||
// 0x8000 when the first part is to be stored with an implied trailing space (see lookup)
int spaceBits;
|
||||
int endOfFirst;
|
||||
|
||||
// invariant. We break after a space if there is one.
|
||||
|
||||
for (int i = 1; i < limit; ++i) {
|
||||
char c = s.charAt(i-1);
|
||||
spaceBits = 0;
|
||||
endOfFirst = i;
|
||||
if (c == ' ') {
|
||||
lastSpace = i;
|
||||
// drop the space from the first part; it is represented by spaceBits instead
endOfFirst--;
|
||||
spaceBits = 0x8000;
|
||||
}
|
||||
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(i);
|
||||
// debugging hook left in by the author
if (firstPart.equals("<START OF ")) {
|
||||
System.out.println("HUH");
|
||||
}
|
||||
int lead = getInt(firstPart);
|
||||
int trail = getInt(lastPart);
|
||||
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH BOTH");
|
||||
return putString(s, spaceBits | lead, trail);
|
||||
}
|
||||
// isLiteral(-1) is true, so an unknown part (getInt returned -1) is skipped here as well
if (!isLiteral(lead)) {
|
||||
if (i > bestLen) {
|
||||
bestLen = i;
|
||||
best_i = i;
|
||||
}
|
||||
if (i > bestSpaceLen && c == ' ') {
|
||||
bestSpaceLen = i;
|
||||
// NOTE(review): this loop splits at i (lastPart = s.substring(i)), yet i + 1 is recorded here — confirm intent
bestSpace_i = i + 1;
|
||||
}
|
||||
}
|
||||
// length of the last part for this split
int end_i = s.length() - i;
|
||||
if (!isLiteral(trail)) {
|
||||
if (end_i > bestLen) {
|
||||
bestLen = end_i;
|
||||
best_i = i;
|
||||
}
|
||||
if (end_i > bestSpaceLen && c == ' ') {
|
||||
bestSpaceLen = end_i;
|
||||
bestSpace_i = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// once any space was seen, only candidates found at a space are kept
if (lastSpace >= 0) {
|
||||
bestLen = bestSpaceLen;
|
||||
best_i = bestSpace_i;
|
||||
}
|
||||
|
||||
spaceBits = 0;
|
||||
|
||||
if (bestLen > 0) { // if one matches, recurse -- and return pair
|
||||
endOfFirst = best_i;
|
||||
// NOTE(review): tests lastSpace > 0 while the block above tests lastSpace >= 0; lastSpace is never 0 since i starts at 1
if (lastSpace > 0) {
|
||||
--endOfFirst;
|
||||
spaceBits = 0x8000;
|
||||
}
|
||||
// NOTE(review): with best_i taken from bestSpace_i (i + 1), these parts differ by one char from the ones probed in the loop — verify
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(best_i);
|
||||
int lead = getInt(firstPart);
|
||||
int trail = getInt(lastPart);
|
||||
if (lead >= 0) {
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH FIRST");
|
||||
return putString(s, spaceBits | lead, addString(lastPart));
|
||||
} else {
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' MATCH SECOND");
|
||||
// presumably trail is a valid token on this path; nothing here checks for -1 — TODO confirm
return putString(s, spaceBits | addString(firstPart), trail);
|
||||
}
|
||||
}
|
||||
// otherwise, we failed to find anything. Then break before the last word, if there is one
|
||||
// otherwise break in the middle (but at even value)
|
||||
|
||||
|
||||
if (lastSpace >= 0) {
|
||||
best_i = lastSpace;
|
||||
endOfFirst = lastSpace - 1;
|
||||
spaceBits = 0x8000;
|
||||
} else {
|
||||
// even split point at or below the midpoint
endOfFirst = best_i = ((s.length() + 1) / 4) * 2;
|
||||
}
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(best_i);
|
||||
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
|
||||
+ "' # '" + lastPart + "' FALLBACK");
|
||||
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
|
||||
}
|
||||
|
||||
/*
|
||||
static int addCompression(String s) {
|
||||
Object in = stringToInt.get(s);
|
||||
if (in != null) return ((Integer) in).intValue();
|
||||
// find best match, recursively
|
||||
int bestBreak = -1;
|
||||
boolean pickFirst = false;
|
||||
for (int i = 1; i < s.length() - 1; ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c == ' ' || c == '-') {
|
||||
Object pos1 = stringToInt.get(s.substring(0,i+1));
|
||||
//Object pos23 = stringToInt.get(s..substring(i));
|
||||
|
||||
|
||||
if (pos2 >= 0 && pos3 >= 0) {
|
||||
fullToCompressed.put(value, new Integer(index + reserved));
|
||||
continue main;
|
||||
}
|
||||
if (pos2 >= 0) {
|
||||
if (k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = true;
|
||||
}
|
||||
} else if (pos3 >= 0) {
|
||||
if (value.length() - k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void gatherData() throws IOException {
|
||||
System.out.println("Gathering data");
|
||||
Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
String[] parts2 = new String[100];
|
||||
int total = 0;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
//if ((i & 0xFF) == 0) System.out.println(Utility.hex(i));
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
String s = ucd.getName(i);
|
||||
total += s.length();
|
||||
int len = Utility.split(s, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
if (parts[j].indexOf('-') >= 0) {
|
||||
// hyphen stuff
|
||||
int len2 = Utility.split(parts[j], '-', parts2);
|
||||
for (int k = 0; k < len2; ++k) {
|
||||
if (k == len2 - 1) {
|
||||
counter.add(parts2[k] + '-');
|
||||
} else {
|
||||
counter.add(parts2[k] + " ");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// normal
|
||||
counter.add(parts[j] + " ");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Sorting data");
|
||||
Map m = counter.extract();
|
||||
|
||||
System.out.println("Printing data");
|
||||
|
||||
PrintWriter log = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
|
||||
32*1024));
|
||||
|
||||
log.println("total: " + total);
|
||||
|
||||
Iterator it = m.keySet().iterator();
|
||||
|
||||
String mondo = "";
|
||||
int i = 0;
|
||||
int strTotal = 0;
|
||||
|
||||
int index = 0;
|
||||
Map fullToCompressed = new HashMap();
|
||||
|
||||
String mondoIndex = "";
|
||||
|
||||
main:
|
||||
while (it.hasNext()) {
|
||||
index++;
|
||||
if ((i & 255) == 0) System.out.println("#" + i);
|
||||
Counter.RWInteger key = (Counter.RWInteger) it.next();
|
||||
String value = (String)m.get(key);
|
||||
log.println(i++ + ": " + key + ": \"" + value + "\"");
|
||||
strTotal += value.length();
|
||||
|
||||
|
||||
// first 128 are the highest frequency, inc. space
|
||||
|
||||
if (index < 128 - SINGLES) {
|
||||
mondo += value;
|
||||
fullToCompressed.put(value, new String((char)(index + reserved)));
|
||||
continue;
|
||||
}
|
||||
|
||||
int pos = mondo.indexOf(value);
|
||||
if (pos >= 0) {
|
||||
// try splitting!
|
||||
|
||||
int bestBreak = -1;
|
||||
boolean pickFirst = false;
|
||||
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
|
||||
int pos2 = mondo.indexOf(value.substring(0,k) + " ");
|
||||
int pos3 = mondo.indexOf(value.substring(k));
|
||||
if (pos2 >= 0 && pos3 >= 0) {
|
||||
fullToCompressed.put(value, new Integer(index + reserved));
|
||||
continue main;
|
||||
}
|
||||
if (pos2 >= 0) {
|
||||
if (k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = true;
|
||||
}
|
||||
} else if (pos3 >= 0) {
|
||||
if (value.length() - k > bestBreak) {
|
||||
bestBreak = k;
|
||||
pickFirst = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bestBreak > 0) {
|
||||
if (pickFirst) {
|
||||
mondo += value.substring(bestBreak);
|
||||
} else {
|
||||
mondo += value.substring(0, bestBreak) + " ";
|
||||
}
|
||||
} else {
|
||||
mondo += value;
|
||||
}
|
||||
}
|
||||
|
||||
// high bit on, means 2 bytes, look in array
|
||||
}
|
||||
|
||||
log.println("strTotal: " + strTotal);
|
||||
log.println("mondo: " + mondo.length());
|
||||
|
||||
int k = 80;
|
||||
for (; k < mondo.length(); k += 80) {
|
||||
log.println(mondo.substring(k-80, k));
|
||||
}
|
||||
log.println(mondo.substring(k-80)); // last line
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
static int indexOf(StringBuffer target, String source) {
|
||||
int targetLen = target.length() - source.length();
|
||||
main:
|
||||
for (int i = 0; i <= targetLen; ++i) {
|
||||
for (int j = 0; j < source.length(); ++j) {
|
||||
if (target.charAt(i) != source.charAt(j)) continue main;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static final int SINGLES = 26 + 10 + 2;
|
||||
*/
|
||||
|
||||
/*
|
||||
static String decode(int x) {
|
||||
if (x < SINGLES) {
|
||||
if (x < 26) return String.valueOf(x + 'A');
|
||||
if (x < 36) return String.valueOf(x - 26 + '0');
|
||||
if (x == 36) return "-";
|
||||
return " ";
|
||||
}
|
||||
if (x < binaryLimit) {
|
||||
x =
|
||||
*/
|
||||
}
|
260
tools/unicodetools/com/ibm/text/UCD/CompactName.java
Normal file
260
tools/unicodetools/com/ibm/text/UCD/CompactName.java
Normal file
|
@ -0,0 +1,260 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
|
||||
/**
 * Experimental tokenizer for compressing character names.
 *
 * <p>A token is an int of one of two kinds:
 * <ul>
 * <li><b>Literal</b> (bit 15 set): up to three characters packed 5 bits each into
 *     bits 10-14, 5-9 and 0-4, using the {@link #compactMap} codes. A code of 0 in the
 *     second or third slot means "no character".</li>
 * <li><b>Table index</b> (bit 15 clear): an index into {@link #tokenList}. Each table
 *     entry stores a lead token in its upper 16 bits and a trail token in its lower
 *     16 bits. Entries at or above {@link #spacedMinimum} are joined with a space when
 *     expanded; lower entries are joined directly.</li>
 * </ul>
 *
 * <p>Only 'A'-'Z', '-', '&gt;', '&lt;' and '*' have non-zero codes; every other
 * character below 128 encodes as 0, and characters at or above 128 are out of range
 * for {@link #compactMap}.
 */
public class CompactName {

    static final boolean DEBUG = false;

    /** Bit that marks a literal token; table indices must stay below this value. */
    private static final int LITERAL_FLAG = 0x8000;

    public static void main(String[] args) throws IOException {

        int test = tokenFromString("ABZ");
        String ss = stringFromToken(test);
        System.out.println(ss);

        CompactName.addWord("ABSOLUTEISM");

        for (int i = 0; i < CompactName.lastToken; ++i) {
            String s = CompactName.stringFromToken(i);
            System.out.println(s);
        }
    }

    /** Character (below 128) to 5-bit code; 0 means "unmapped". */
    static final char[] compactMap = new char[128];
    /** 5-bit code back to character; the inverse of {@link #compactMap}. */
    static final char[] compactUnmap = new char[128];

    static {
        char counter = 0;
        compactMap[0] = counter++;          // code 0 is reserved for "nothing"
        for (int i = 'A'; i <= 'Z'; ++i) {
            compactMap[i] = counter++;      // codes 1..26
        }
        compactMap['-'] = counter++;
        compactMap['>'] = counter++;
        compactMap['<'] = counter++;
        compactMap['*'] = counter++;

        compactUnmap[0] = 0;
        for (char i = 0; i < compactUnmap.length; ++i) {
            int x = compactMap[i];
            if (x != 0) compactUnmap[x] = i;
        }
    }

    /** Maps each registered string (longer than three characters) to its table index. */
    static Map string_token = new HashMap();
    static Map token_string = new HashMap();

    /** Table of (lead << 16 | trail) pairs, indexed by table token. */
    static int[] tokenList = new int[40000];
    static final int tokenStart = 0;
    /** Number of table entries in use; also the next index to be handed out. */
    static int lastToken = 0;

    /** First table index whose halves are joined with a space; set by {@link #startLines()}. */
    static int spacedMinimum = Integer.MAX_VALUE;

    /**
     * Returns true if bit 15 of the token is set. Note that this is also true for -1,
     * the "not found" value of {@link #tokenFromString(String)}.
     */
    static boolean isLiteral(int i) {
        return (i & LITERAL_FLAG) != 0;
    }

    /**
     * Appends a new table entry for {@code s} made of the given lead and trail tokens.
     *
     * @return the table index assigned to {@code s}
     * @throws IllegalArgumentException if {@code s} already has a table entry
     * @throws IllegalStateException if the next index would collide with the literal flag
     */
    static int addTokenForString(String s, int lead, int trail) {
        Object in = string_token.get(s);
        if (in != null) throw new IllegalArgumentException("duplicate token string: " + s);
        // An index with bit 15 set would later be decoded as a literal, silently
        // corrupting every string that refers to it, so refuse to create one.
        if (lastToken >= LITERAL_FLAG) {
            throw new IllegalStateException("token table full: " + lastToken);
        }
        int value = (lead << 16) + (trail & 0xFFFF);
        int result = lastToken;
        tokenList[lastToken++] = value;

        if (DEBUG) {
            System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
            String roundTrip = stringFromToken(result);
            if (!roundTrip.equals(s)) {
                System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
            }
        }
        string_token.put(s, new Integer(result));
        return result;
    }

    /**
     * Expands a token back into its string.
     *
     * @throws IllegalArgumentException if {@code i} is a table index that has not been assigned
     */
    static String stringFromToken(int i) {
        String result;
        if ((i & LITERAL_FLAG) != 0) {
            char first = compactUnmap[(i >> 10) & 0x1F];
            char second = compactUnmap[(i >> 5) & 0x1F];
            char third = compactUnmap[i & 0x1F];
            result = String.valueOf(first);
            if (second != 0) result += String.valueOf(second);
            if (third != 0) result += String.valueOf(third);
        } else if (i < 0 || i >= lastToken) {
            // Slot lastToken itself is not yet written (it holds 0), so reading it would
            // expand token 0 twice, or recurse forever when the table is empty.
            throw new IllegalArgumentException("bad token: " + i);
        } else {
            int value = tokenList[i];
            int lead = value >>> 16;
            int trail = value & 0xFFFF;
            if (i >= spacedMinimum) result = stringFromToken(lead) + ' ' + stringFromToken(trail);
            else result = stringFromToken(lead) + stringFromToken(trail);
        }
        if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
        return result;
    }

    /**
     * Returns the token for {@code s}: a literal for strings of one to three characters,
     * otherwise the registered table index, or -1 if {@code s} has not been registered.
     * {@code s} must not be empty.
     */
    static int tokenFromString(String s) {
        if (s.length() <= 3) {
            int first = compactMap[s.charAt(0)];
            int second = compactMap[s.length() > 1 ? s.charAt(1) : 0];
            int third = compactMap[s.length() > 2 ? s.charAt(2) : 0];
            return LITERAL_FLAG + (first << 10) + (second << 5) + third;
        }
        Object in = string_token.get(s);
        if (in == null) return -1;
        return ((Integer)in).intValue();
    }

    /**
     * Returns the token for a single word, registering it (and any pieces it has to be
     * split into) if necessary. Split preference: a point where both halves already
     * have tokens; else the point giving the longest already-registered half; else a
     * fixed point at a multiple of three characters.
     */
    static int addWord(String s) {

        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;

        int limit = s.length() - 1;

        for (int i = limit; i >= 1; --i) {

            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i);

            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            // isLiteral(-1) is true, so these tests select only halves that are
            // already registered table entries.
            if (!isLiteral(lead)) {
                if (i > bestLen) {
                    bestLen = i;
                    best_i = i;
                }
            }
            if (!isLiteral(trail)) {
                int end_i = s.length() - i;
                if (end_i > bestLen) {
                    bestLen = end_i;
                    best_i = i;
                }
            }
        }
        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addWord(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                return addTokenForString(s, addWord(firstPart), trail);
            }
        }

        // break at multiple of 3

        best_i = ((s.length() + 1) / 6) * 3;
        String firstPart = s.substring(0, best_i);
        String lastPart = s.substring(best_i);
        if (DEBUG) show(s, firstPart, lastPart, "Fallback");
        return addTokenForString(s, addWord(firstPart), addWord(lastPart));
    }

    /** Debug helper: prints how {@code s} was split. */
    static void show(String s, String firstPart, String lastPart, String comment) {
        System.out.println((s) + " => '" + (firstPart)
            + "' # '" + (lastPart) + "' " + comment);
    }

    /** Marks the boundary after which new table entries expand with a space between halves. */
    static void startLines() {
        spacedMinimum = lastToken;
    }

    /**
     * Returns the token for a space-separated line, registering it if necessary.
     * The line is split at a space; a split where both sides already have tokens is
     * used immediately, otherwise the split leaving the longest side is used and the
     * side without a token is registered recursively.
     *
     * @throws IllegalArgumentException if {@code s} has no token and contains no usable space
     */
    static int addLine(String s) {

        int result = tokenFromString(s);
        if (result != -1) return result;
        int bestLen = 0;
        int best_i = 0;

        int limit = s.length() - 2;

        for (int i = limit; i >= 1; --i) {
            char c = s.charAt(i);
            if (c != ' ') continue;

            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i+1);

            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            if (i > bestLen) {
                bestLen = i;
                best_i = i;
            }

            int end_i = s.length() - i - 1;
            if (end_i > bestLen) {
                bestLen = end_i;
                best_i = i;
            }
        }
        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i + 1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addLine(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                return addTokenForString(s, addLine(firstPart), trail);
            }
        }

        System.out.println("SHOULD HAVE MATCHED!!");
        throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
    }
}
|
831
tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
Normal file
831
tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
Normal file
|
@ -0,0 +1,831 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.NumberFormat;
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/** Simple program to merge UCD files into XML. Not yet documented!!
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public final class ConvertUCD implements UCD_Types {
|
||||
public static final boolean SHOW = true;
|
||||
public static final boolean DEBUG = false;
|
||||
|
||||
public static int major;
|
||||
public static int minor;
|
||||
public static int update;
|
||||
|
||||
static String version;
|
||||
|
||||
// varies by version
|
||||
/*
|
||||
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR20 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR21 = DATA_DIR + "\\Versions\\";
|
||||
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
|
||||
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
|
||||
*/
|
||||
|
||||
//public static final String blocksnamePlain = "Blocks.txt";
|
||||
//public static final String blocksname31 = "Blocks-4d2.beta";
|
||||
|
||||
/** First item is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
||||
static String[][] labelList = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
|
||||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"ExtraProperties", "xp"},
|
||||
{"PropList", "binary"},
|
||||
|
||||
//{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth", "ea", "OMIT"},
|
||||
{"LineBreak", "lb", "OMIT"},
|
||||
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions", "ce"},
|
||||
{"CaseFolding", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts", "sn"},
|
||||
//{"Jamo", "jn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
//*/
|
||||
};
|
||||
/*
|
||||
static String[][] labelList31 = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
|
||||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"PropList-3.1.0d5.beta", "binary"},
|
||||
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions-3d6.beta", "ce"},
|
||||
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts-3.1.0d4.beta", "sn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
//
|
||||
};
|
||||
/*
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions-3d6.beta", "ce"},
|
||||
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
|
||||
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts-1d4", "sn"},
|
||||
//{"Scripts-1d4", "RANGE", "sn"},
|
||||
//{"Age", "*sn"},
|
||||
//*/
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
//
|
||||
|
||||
//"NamesList-3.1.0d1.beta"
|
||||
|
||||
static String[][] labelList30 = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
// The one exception is "st", which is handled specially.
|
||||
// So file order is important.
|
||||
//*
|
||||
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"CompositionExclusions", "ce"},
|
||||
{"EastAsianWidth", "ea", "OMIT"},
|
||||
{"LineBreak", "lb", "OMIT"},
|
||||
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
|
||||
{"CaseFolding", "OMIT", "*fc"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
{"PropList.alpha", "RANGE", "OMIT"},
|
||||
//
|
||||
};
|
||||
|
||||
static String[][] labelList11 = {
|
||||
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
static String[][] labelList20 = {
|
||||
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
static String[][] labelList21 = {
|
||||
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
*/
|
||||
|
||||
// handles
|
||||
public static final String blocksname = "Blocks";
|
||||
//public static final String[][] labelList;
|
||||
public static final boolean NEWPROPS = true;
|
||||
|
||||
/*
|
||||
static {
|
||||
switch (major*10 + minor) {
|
||||
case 31:
|
||||
blocksname = blocksname31;
|
||||
labelList = labelList31;
|
||||
break;
|
||||
case 30:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList30;
|
||||
break;
|
||||
case 21:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList21;
|
||||
break;
|
||||
case 20:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList20;
|
||||
break;
|
||||
default:
|
||||
blocksname = blocksnamePlain;
|
||||
labelList = labelList11;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
static final String dataFilePrefix = "UCD_Data";
|
||||
|
||||
|
||||
// MAIN!!
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
System.out.println("ConvertUCD");
|
||||
|
||||
log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "UCD-log.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
log.write("\uFEFF"); // BOM
|
||||
|
||||
try {
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
version = args[i];
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
String[] parts = new String[3];
|
||||
Utility.split(version, '.', parts);
|
||||
major = Integer.parseInt(parts[0]);
|
||||
minor = Integer.parseInt(parts[1]);
|
||||
update = Integer.parseInt(parts[2]);
|
||||
|
||||
toJava();
|
||||
}
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
static void toXML() throws Exception {
|
||||
// Blocks is special
|
||||
// Unihan is special
|
||||
// collect all the other .txt files in the directory
|
||||
if (false) readBlocks();
|
||||
if (true) for (int i = 0; i < labelList.length; ++i) {
|
||||
readSemi(labelList[i]);
|
||||
} else {
|
||||
readSemi(labelList[0]); // TESTING ONLY
|
||||
}
|
||||
writeXML();
|
||||
}
|
||||
*/
|
||||
|
||||
static void toJava() throws Exception {
|
||||
// Blocks is special
|
||||
// Unihan is special
|
||||
// collect all the other .txt files in the directory
|
||||
if (false) readBlocks();
|
||||
if (true) for (int i = 0; i < labelList.length; ++i) {
|
||||
readSemi(labelList[i]);
|
||||
} else {
|
||||
readSemi(labelList[0]); // TESTING ONLY
|
||||
}
|
||||
|
||||
Iterator it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
UData value = (UData) charData.get(key);
|
||||
value.compact();
|
||||
}
|
||||
UData ud = getEntry(0x2A6D6);
|
||||
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
|
||||
ud = getEntry(0xFFFF);
|
||||
System.out.println("SPOT-CHECK: FFFF: " + ud);
|
||||
|
||||
writeJavaData();
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
//static String directory = BASE_DIR;
|
||||
//static Map appendDuplicates = new HashMap();
|
||||
|
||||
/** First item in labels is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
||||
static HashMap isHex = new HashMap();
|
||||
static HashMap defaults = new HashMap();
|
||||
|
||||
static {
|
||||
for (int j = 0; j < labelList.length; ++j) {
|
||||
String[] labels = labelList[j];
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
boolean hex = false;
|
||||
String def = null;
|
||||
//char appendChar = '\u0000';
|
||||
|
||||
// pull off "*": hex interpretation
|
||||
if (labels[i].charAt(0) == '*') { // HEX value
|
||||
hex = true;
|
||||
labels[i] = labels[i].substring(1);
|
||||
}
|
||||
|
||||
/*
|
||||
// pull off "$": append duplicates
|
||||
if (labels[i].charAt(0) == '$') { // HEX value
|
||||
appendChar = labels[i].charAt(1);
|
||||
labels[i] = labels[i].substring(2);
|
||||
}
|
||||
|
||||
// pull off default values
|
||||
int pos = labels[i].indexOf('-');
|
||||
if (pos >= 0) {
|
||||
def = labels[i].substring(pos+1);
|
||||
labels[i] = labels[i].substring(0,pos);
|
||||
}
|
||||
*/
|
||||
// store results
|
||||
// we do this after all processing, so that the label is clean!!
|
||||
|
||||
if (hex) isHex.put(labels[i], "");
|
||||
//if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
|
||||
defaults.put(labels[i], def);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static List blockData = new LinkedList();
|
||||
|
||||
static void readBlocks() throws Exception {
|
||||
System.out.println("Reading 'Blocks'");
|
||||
BufferedReader input = Utility.openUnicodeFile(blocksname, version);
|
||||
String line = "";
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
//String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
if (count != 3) throw new ChainException("Bad count in Blocks", null);
|
||||
blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()});
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line);
|
||||
throw e;
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
static Set properties = new TreeSet();
|
||||
|
||||
static void readSemi(String[] labels) throws Exception {
|
||||
System.out.println();
|
||||
System.out.println("Reading '" + labels[0] + "'");
|
||||
if (major < 3 || (major == 3 && minor < 1)) {
|
||||
if (labels[0] == "PropList") {
|
||||
System.out.println("SKIPPING old format of Proplist for " + version);
|
||||
return;
|
||||
}
|
||||
}
|
||||
String tempVersion = version;
|
||||
if (version.equals(UCD.latestVersion)) tempVersion = "";
|
||||
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion);
|
||||
if (input == null) {
|
||||
System.out.println("COULDN'T OPEN: " + labels[0]);
|
||||
return;
|
||||
}
|
||||
boolean showedSemi = false;
|
||||
boolean showedShort = false;
|
||||
String line = "";
|
||||
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) {
|
||||
comment = line.substring(commentPos+1).trim();
|
||||
line = line.substring(0, commentPos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
|
||||
if (parts[0].equals("2801")) {
|
||||
System.out.println("debug?");
|
||||
}
|
||||
|
||||
// fix malformed or simple lists.
|
||||
|
||||
if (count != labels.length) {
|
||||
if (count == labels.length + 1 && parts[count-1].equals("")) {
|
||||
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
|
||||
showedSemi = true;
|
||||
} else if (count == 1) { // fix simple list
|
||||
++count;
|
||||
parts[1] = "Y";
|
||||
} else if (count < labels.length) {
|
||||
if (!showedShort) System.out.println("Line shorter than labels: " + original);
|
||||
showedShort = true;
|
||||
for (int i = count; i < labels.length; ++i) {
|
||||
parts[i] = "";
|
||||
}
|
||||
} else {
|
||||
throw new ChainException("wrong count: {0}",
|
||||
new Object[] {new Integer(line), new Integer(count)});
|
||||
}
|
||||
}
|
||||
|
||||
// store char
|
||||
// first field is always character OR range. May be UTF-32
|
||||
int cpTop;
|
||||
int cpStart;
|
||||
int ddot = parts[0].indexOf(".");
|
||||
if (ddot >= 0) {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
|
||||
cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
|
||||
System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
|
||||
} else {
|
||||
cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
|
||||
cpTop = cpStart;
|
||||
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// properties first
|
||||
if (labels[1].equals("PROP")) {
|
||||
String prop = parts[2].trim();
|
||||
// FIX!!
|
||||
boolean skipLetters = false;
|
||||
if (prop.equals("Alphabetic")) {
|
||||
prop = "Other_Alphabetic";
|
||||
skipLetters = true;
|
||||
}
|
||||
// END FIX!!
|
||||
properties.add(prop);
|
||||
if (Utility.find(prop, UCD_Names.DeletedProperties) == -1) { // only undeleted
|
||||
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
if (end == 0) end = cpStart;
|
||||
|
||||
for (int j = cpStart; j <= end; ++j) {
|
||||
if (j != UCD.mapToRepresentative(j, false)) continue;
|
||||
if (skipLetters && getEntry(cpStart).isLetter()) continue;
|
||||
appendCharProperties(j, prop);
|
||||
}
|
||||
}
|
||||
} else { // not range!
|
||||
String val = "";
|
||||
String lastVal;
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
String key = labels[i];
|
||||
lastVal = val;
|
||||
if (isHex.get(key) != null) {
|
||||
val = Utility.fromHex(parts[i]);
|
||||
} else {
|
||||
val = parts[i].trim();
|
||||
}
|
||||
if (key.equals("OMIT")) continue; // do after val, so lastVal is correct
|
||||
if (key.equals("RANGE")) continue; // do after val, so lastVal is correct
|
||||
if (val.equals("")) continue; // skip empty values, they mean default
|
||||
|
||||
for (int cps = cpStart; cps <= cpTop; ++cps) {
|
||||
if (UCD.mapToRepresentative(cps, false) != cps) continue; // skip condensed ranges
|
||||
|
||||
if (key.equals("binary")) {
|
||||
appendCharProperties(cps, val);
|
||||
} else if (key.equals("fc")) {
|
||||
UData data = getEntry(cps);
|
||||
String type = parts[i-1].trim();
|
||||
if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
|
||||
data.fullCaseFolding = val;
|
||||
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("S") || type.equals("C") || type.equals("L")) {
|
||||
data.simpleCaseFolding = val;
|
||||
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
if (type.equals("I")) {
|
||||
data.simpleCaseFolding = val;
|
||||
setBinaryProperty(cps, CaseFoldTurkishI);
|
||||
System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
} else {
|
||||
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
|
||||
UData data = getEntryIfExists(cps);
|
||||
if (data == null || data.generalCategory == Cn) continue;
|
||||
}
|
||||
*/
|
||||
addCharData(cps, key, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line + ", " + e.getMessage());
|
||||
throw e;
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
//printValues("JOINING_TYPE", jtSet);
|
||||
//printValues("JOINING_GROUP", jgSet);
|
||||
}
|
||||
|
||||
static void printValues(String title, Set s) {
|
||||
Iterator it = s.iterator();
|
||||
System.out.println("public static String[] " + title + " = {");
|
||||
while (it.hasNext()) {
|
||||
String value = (String) it.next();
|
||||
System.out.println(" \"" + value + "\",");
|
||||
}
|
||||
System.out.println("};");
|
||||
it = s.iterator();
|
||||
System.out.println("public static byte ");
|
||||
int count = 0;
|
||||
while (it.hasNext()) {
|
||||
String value = (String) it.next();
|
||||
System.out.println(" " + value.replace(' ', '-').toUpperCase() + " = " + (count++) + ",");
|
||||
}
|
||||
System.out.println(" LIMIT_" + title + " = " + count);
|
||||
System.out.println(";");
|
||||
}
|
||||
|
||||
static Map charData = new TreeMap();
|
||||
|
||||
static void writeXML() throws IOException {
|
||||
System.out.println("Writing 'UCD-Main.xml'");
|
||||
BufferedWriter output = new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
|
||||
"UTF8"),
|
||||
32*1024);
|
||||
|
||||
try {
|
||||
// write header
|
||||
|
||||
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
|
||||
output.write("<UnicodeCharacterDatabase>\r\n");
|
||||
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
|
||||
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
|
||||
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
|
||||
|
||||
// write blocks
|
||||
|
||||
Iterator it = blockData.iterator();
|
||||
while (it.hasNext()) {
|
||||
String[] block = (String[]) it.next();
|
||||
output.write(" <block start='" + Utility.quoteXML(block[0])
|
||||
+ "' end='" + Utility.quoteXML(block[1])
|
||||
+ "' name='" + Utility.quoteXML(block[2])
|
||||
+ "'/>\r\n" );
|
||||
}
|
||||
|
||||
// write char data
|
||||
|
||||
it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Integer cc = (Integer) it.next();
|
||||
output.write(" <e c='" + Utility.quoteXML(cc.intValue()) + "'" );
|
||||
/*
|
||||
UData data = (UData) charData.get(cc);
|
||||
Iterator dataIt = data.keySet().iterator();
|
||||
while (dataIt.hasNext()) {
|
||||
String label = (String) dataIt.next();
|
||||
if (label.equals("c")) continue; // already wrote it.
|
||||
if (label.equals("fc")) {
|
||||
String fc = getResolved(data, "fc");
|
||||
String lc = getResolved(data, "lc");
|
||||
if (!fc.equals(lc) && !lc.equals(cc)) log.println("FC " + fc.length() + ": " + toString(cc));
|
||||
}
|
||||
String value = Utility.quoteXML((String) data.get(label));
|
||||
output.write(" " + label + "='" + value + "'");
|
||||
}
|
||||
*/
|
||||
output.write("/>\r\n");
|
||||
}
|
||||
|
||||
// write footer
|
||||
|
||||
output.write("</UnicodeCharacterDatabase>\r\n");
|
||||
} finally {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
|
||||
static void writeJavaData() throws IOException {
|
||||
Iterator it = charData.keySet().iterator();
|
||||
int codePoint = -1;
|
||||
System.out.println("Writing " + dataFilePrefix + version);
|
||||
DataOutputStream dataOut = new DataOutputStream(
|
||||
new BufferedOutputStream(
|
||||
new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
|
||||
128*1024));
|
||||
|
||||
// write header
|
||||
dataOut.writeByte(BINARY_FORMAT);
|
||||
dataOut.writeByte(major);
|
||||
dataOut.writeByte(minor);
|
||||
dataOut.writeByte(update);
|
||||
long millis = System.currentTimeMillis();
|
||||
dataOut.writeLong(millis);
|
||||
dataOut.writeInt(charData.size());
|
||||
System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
|
||||
int count = 0;
|
||||
|
||||
// write records
|
||||
try {
|
||||
// write char data
|
||||
|
||||
while (it.hasNext()) {
|
||||
Object cc = (Object) it.next();
|
||||
//codePoint = UTF32.char32At(cc,0);
|
||||
if (DEBUG) System.out.println(Utility.hex(cc));
|
||||
|
||||
UData uData = (UData) charData.get(cc);
|
||||
if (false && uData.name == null) {
|
||||
System.out.println("Warning: NULL name\r\n" + uData);
|
||||
System.out.println();
|
||||
}
|
||||
if (uData.codePoint == 0x2801) {
|
||||
System.out.println("SPOT-CHECK: " + uData);
|
||||
}
|
||||
uData.writeBytes(dataOut);
|
||||
count++;
|
||||
if (DEBUG) System.out.println("Setting2");
|
||||
}
|
||||
System.out.println("Wrote Data " + count);
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e);
|
||||
} finally {
|
||||
dataOut.close();
|
||||
}
|
||||
}
|
||||
|
||||
static String[] xsSplit = new String[40];
|
||||
|
||||
// Cache a little bit for speed
|
||||
static int getEntryCodePoint = -1;
|
||||
static UData getEntryUData = null;
|
||||
|
||||
static UData getEntryIfExists(int cp) {
|
||||
if (cp == getEntryCodePoint) return getEntryUData;
|
||||
Integer cc = new Integer(cp);
|
||||
UData charEntry = (UData) charData.get(cc);
|
||||
if (charEntry == null) return null;
|
||||
getEntryCodePoint = cp;
|
||||
getEntryUData = charEntry;
|
||||
return charEntry;
|
||||
}
|
||||
|
||||
/* Get entry in table for cc
|
||||
*/
|
||||
static UData getEntry(int cp) {
|
||||
if (cp == getEntryCodePoint) return getEntryUData;
|
||||
Integer cc = new Integer(cp);
|
||||
UData charEntry = (UData) charData.get(cc);
|
||||
if (charEntry == null) {
|
||||
charEntry = new UData(cp);
|
||||
charData.put(cc, charEntry);
|
||||
//charEntry.put("c", cc);
|
||||
}
|
||||
getEntryCodePoint = cp;
|
||||
getEntryUData = charEntry;
|
||||
return charEntry;
|
||||
}
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
|
||||
static void setBinaryProperty(int cp, int binProp) {
|
||||
UData charEntry = getEntry(cp);
|
||||
charEntry.binaryProperties |= (1 << binProp);
|
||||
}
|
||||
|
||||
static void appendCharProperties(int cp, String key) {
|
||||
int ind;
|
||||
//if (true || NEWPROPS) {
|
||||
ind = Utility.lookup(key, UCD_Names.BP);
|
||||
/*} else {
|
||||
ind = Utility.lookup(key, UCD_Names.BP_OLD);
|
||||
}
|
||||
*/
|
||||
//charEntry.binaryProperties |= (1 << ind);
|
||||
setBinaryProperty(cp, ind);
|
||||
}
|
||||
|
||||
static Set jtSet = new TreeSet();
|
||||
static Set jgSet = new TreeSet();
|
||||
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
static void addCharData(int cp, String key, String value) {
|
||||
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
|
||||
UData charEntry = getEntry(cp);
|
||||
//if (cp < 10) System.out.println(" " + charEntry);
|
||||
|
||||
if (key.equals("bm")) {
|
||||
if (value.equals("Y")) charEntry.binaryProperties |= 1;
|
||||
} else if (key.equals("ce")) {
|
||||
charEntry.binaryProperties |= 2;
|
||||
} else if (key.equals("on")) {
|
||||
if (charEntry.name.charAt(0) == '<') {
|
||||
charEntry.name = '<' + value + '>';
|
||||
}
|
||||
} else if (key.equals("dm")) {
|
||||
charEntry.decompositionType = CANONICAL;
|
||||
if (value.charAt(0) == '<') {
|
||||
int pos = value.indexOf('>');
|
||||
String dType = value.substring(1,pos);
|
||||
if (major < 2) if (dType.charAt(0) == '+') dType = dType.substring(1);
|
||||
value = value.substring(pos+1);
|
||||
setField(charEntry, "dt", dType);
|
||||
}
|
||||
// FIX OLD
|
||||
if (major < 2) {
|
||||
int oldStyle = value.indexOf('<');
|
||||
if (oldStyle > 0) {
|
||||
value = value.substring(0,oldStyle);
|
||||
}
|
||||
oldStyle = value.indexOf('{');
|
||||
if (oldStyle > 0) {
|
||||
value = value.substring(0,oldStyle);
|
||||
}
|
||||
}
|
||||
setField(charEntry, key, Utility.fromHex(value));
|
||||
|
||||
// fix the numeric fields to be more sensible
|
||||
} else if (key.equals("dd")) {
|
||||
if (charEntry.numericType < UCD_Types.DECIMAL) {
|
||||
charEntry.numericType = UCD_Types.DECIMAL;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
} else if (key.equals("dv")) {
|
||||
if (charEntry.numericType < UCD_Types.DIGIT) {
|
||||
charEntry.numericType = UCD_Types.DIGIT;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
} else if (key.equals("nv")) {
|
||||
if (charEntry.numericType < UCD_Types.NUMERIC) {
|
||||
charEntry.numericType = UCD_Types.NUMERIC;
|
||||
}
|
||||
setField(charEntry, "nv", value);
|
||||
/*} else if (key.equals("jt")) {
|
||||
jtSet.add(value);
|
||||
} else if (key.equals("jg")) {
|
||||
jgSet.add(value);
|
||||
*/
|
||||
} else {
|
||||
setField(charEntry, key, value);
|
||||
}
|
||||
}
|
||||
|
||||
static public void setField(UData uData, String fieldName, String fieldValue) {
|
||||
try {
|
||||
if (fieldName.equals("n")) {
|
||||
uData.name = fieldValue;
|
||||
} else if (fieldName.equals("dm")) {
|
||||
uData.decompositionMapping = fieldValue;
|
||||
} else if (fieldName.equals("bg")) {
|
||||
uData.bidiMirror = fieldValue;
|
||||
} else if (fieldName.equals("uc")) {
|
||||
uData.simpleUppercase = fieldValue;
|
||||
} else if (fieldName.equals("lc")) {
|
||||
uData.simpleLowercase = fieldValue;
|
||||
} else if (fieldName.equals("tc")) {
|
||||
uData.simpleTitlecase = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("su")) {
|
||||
uData.fullUppercase = fieldValue;
|
||||
} else if (fieldName.equals("sl")) {
|
||||
uData.fullLowercase = fieldValue;
|
||||
} else if (fieldName.equals("st")) {
|
||||
uData.fullTitlecase = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("sc")) {
|
||||
uData.specialCasing = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("xp")) {
|
||||
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP);
|
||||
//UCD_Names.BP_OLD
|
||||
|
||||
} else if (fieldName.equals("gc")) {
|
||||
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GC);
|
||||
} else if (fieldName.equals("bc")) {
|
||||
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BC);
|
||||
} else if (fieldName.equals("dt")) {
|
||||
if (major < 2) {
|
||||
if (fieldValue.equals("no-break")) fieldValue = "noBreak";
|
||||
else if (fieldValue.equals("circled")) fieldValue = "circle";
|
||||
else if (fieldValue.equals("sup")) fieldValue = "super";
|
||||
else if (fieldValue.equals("break")) fieldValue = "compat";
|
||||
else if (fieldValue.equals("font variant")) fieldValue = "font";
|
||||
else if (fieldValue.equals("no-join")) fieldValue = "compat";
|
||||
else if (fieldValue.equals("join")) fieldValue = "compat";
|
||||
}
|
||||
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.DT);
|
||||
} else if (fieldName.equals("nt")) {
|
||||
uData.numericType = Utility.lookup(fieldValue, UCD_Names.NT);
|
||||
|
||||
} else if (fieldName.equals("ea")) {
|
||||
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EA);
|
||||
} else if (fieldName.equals("lb")) {
|
||||
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LB);
|
||||
|
||||
} else if (fieldName.equals("sn")) {
|
||||
uData.script = Utility.lookup(fieldValue, UCD_Names.SCRIPT);
|
||||
|
||||
} else if (fieldName.equals("jt")) {
|
||||
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE);
|
||||
} else if (fieldName.equals("jg")) {
|
||||
uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.OLD_JOINING_GROUP);
|
||||
|
||||
} else if (fieldName.equals("nv")) {
|
||||
if (major < 2) {
|
||||
if (fieldValue.equals("-")) return;
|
||||
}
|
||||
uData.numericValue = Utility.floatFrom(fieldValue);
|
||||
} else if (fieldName.equals("cc")) {
|
||||
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
|
||||
} else if (fieldName.equals("bp")) {
|
||||
uData.binaryProperties = (byte)Utility.intFrom(fieldValue);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown fieldName");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException(
|
||||
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
440
tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
Normal file
440
tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
Normal file
|
@ -0,0 +1,440 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class DerivedPropertyLister extends PropertyLister {
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
static int enum = 0;
|
||||
static final int
|
||||
PropMath = 0,
|
||||
PropAlphabetic = 1,
|
||||
PropLowercase = 2,
|
||||
PropUppercase = 3,
|
||||
|
||||
ID_Start = 4,
|
||||
ID_Continue_NO_Cf = 5,
|
||||
|
||||
Mod_ID_Start = 6,
|
||||
Mod_ID_Continue_NO_Cf = 7,
|
||||
|
||||
Missing_Uppercase = 8,
|
||||
Missing_Lowercase = 9,
|
||||
Missing_Mixedcase = 10,
|
||||
|
||||
FC_NFKC_Closure = 11,
|
||||
|
||||
FullCompExclusion = 12,
|
||||
FullCompInclusion = 13,
|
||||
|
||||
QuickNFD = 14,
|
||||
QuickNFC = 15,
|
||||
QuickNFKD = 16,
|
||||
QuickNFKC = 17,
|
||||
|
||||
ExpandsOnNFD = 18,
|
||||
ExpandsOnNFC = 19,
|
||||
ExpandsOnNFKD = 20,
|
||||
ExpandsOnNFKC = 21,
|
||||
|
||||
GenNFD = 22,
|
||||
GenNFC = 23,
|
||||
GenNFKD = 24,
|
||||
GenNFKC = 25,
|
||||
|
||||
LIMIT = 26;
|
||||
;
|
||||
|
||||
private int propMask;
|
||||
private Normalizer[] nf = new Normalizer[4];
|
||||
private Normalizer nfd, nfc, nfkd, nfkc;
|
||||
int width;
|
||||
|
||||
public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
|
||||
this.propMask = propMask;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
nfd = nf[0] = new Normalizer(Normalizer.NFD);
|
||||
nfc = nf[1] = new Normalizer(Normalizer.NFC);
|
||||
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
|
||||
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
width = super.minPropertyWidth();
|
||||
switch (propMask) {
|
||||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
alwaysBreaks = true;
|
||||
break;
|
||||
case FC_NFKC_Closure:
|
||||
alwaysBreaks = true;
|
||||
width = 21;
|
||||
break;
|
||||
case QuickNFC: case QuickNFKC:
|
||||
width = 11;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
String result = "# Derived Property: ";
|
||||
switch (propMask) {
|
||||
case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
|
||||
result += "Expands_On_" + NAME[propMask-ExpandsOnNFD] + "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters whose normalized length is not one."
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
break;
|
||||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
result += NAME[propMask-GenNFD] + "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Normalized forms, where different from the characters themselves."
|
||||
+ ((propMask == 5 || propMask == 3)
|
||||
? ""
|
||||
: "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly.")
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
|
||||
break;
|
||||
case ID_Start: result +=
|
||||
"ID_Start"
|
||||
+ "\r\n# Characters that can start an identifier."
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
|
||||
break;
|
||||
case ID_Continue_NO_Cf: result +=
|
||||
"ID_Continue"
|
||||
+ "\r\n# Characters that can continue an identifier."
|
||||
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
|
||||
+ "\r\n# NOTE: Cf characters should be filtered out.";
|
||||
break;
|
||||
case Mod_ID_Start: result +=
|
||||
"XID_Start"
|
||||
+ "\r\n# ID_Start modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
break;
|
||||
case Mod_ID_Continue_NO_Cf: result +=
|
||||
"XID_Continue"
|
||||
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
+ "\r\n# NOTE: Cf characters should be filtered out."
|
||||
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
break;
|
||||
case PropMath:
|
||||
result += "Math"
|
||||
+ "\r\n# Generated from: Sm + Other_Math";
|
||||
break;
|
||||
case PropAlphabetic:
|
||||
result += "Alphabetic"
|
||||
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
|
||||
break;
|
||||
case PropLowercase:
|
||||
result += "Lowercase"
|
||||
+ "\r\n# Generated from: Ll + Other_Lowercase";
|
||||
break;
|
||||
case PropUppercase: result +=
|
||||
"Uppercase"
|
||||
+ "\r\n# Generated from: Lu + Other_Uppercase";
|
||||
break;
|
||||
case Missing_Uppercase: result +=
|
||||
"Missing_Uppercase"
|
||||
+ "\r\n# Generated from: NFKD has >0 Uppercase, no other cases";
|
||||
break;
|
||||
case Missing_Lowercase: result +=
|
||||
"Missing_Lowercase"
|
||||
+ "\r\n# Generated from: NFKD has >0 Lowercase, no other cases";
|
||||
break;
|
||||
case Missing_Mixedcase: result +=
|
||||
"Missing_Mixedcase"
|
||||
+ "\r\n# Generated from: NFKD has >0 Mixedcase, no other cases";
|
||||
break;
|
||||
case FullCompExclusion: result +=
|
||||
"Full Composition Exclusion"
|
||||
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
|
||||
break;
|
||||
case FullCompInclusion: result +=
|
||||
"Full Composition Inclusion"
|
||||
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
|
||||
break;
|
||||
case FC_NFKC_Closure: result +=
|
||||
"FC_NFKC_Closure"
|
||||
+ "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
|
||||
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
|
||||
+ "\r\n# mappings that constitute the FC_NFKC_Closure list";
|
||||
break;
|
||||
case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
|
||||
result += NAME[propMask-QuickNFD] + "_QuickCheck"
|
||||
+ "\r\n# Generated from computing decomposibles"
|
||||
+ ((propMask == QuickNFC || propMask == QuickNFKC)
|
||||
? " (and characters that may compose with previous ones)" : "");
|
||||
break;
|
||||
default: result += "Unimplemented!!";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
switch (propMask) {
|
||||
case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
|
||||
return "Expands_On_" + NAME[propMask-ExpandsOnNFD];
|
||||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
if (cp >= 0xAC00 && cp <= 0xD7A3) return NAME[propMask-GenNFD] + "; " + "<algorithmic normalization>";
|
||||
String norm = Utility.hex(nf[propMask-GenNFD].normalize(cp));
|
||||
String pad = Utility.repeat(" ", 14-norm.length());
|
||||
return NAME[propMask-GenNFD] + "; " + norm + pad;
|
||||
case ID_Start: return "ID_Start";
|
||||
case ID_Continue_NO_Cf: return "ID_Continue";
|
||||
case Mod_ID_Start: return "XID_Start";
|
||||
case Mod_ID_Continue_NO_Cf: return "XID_Continue";
|
||||
case PropMath: return "Math";
|
||||
case PropAlphabetic: return "Alphabetic";
|
||||
case PropLowercase: return "Lowercase";
|
||||
case PropUppercase: return "Uppercase";
|
||||
case Missing_Uppercase: return "Possible_Missing_Uppercase";
|
||||
case Missing_Lowercase: return "Possible_Missing_Lowercase";
|
||||
case Missing_Mixedcase: return "Possible_Missing_Titlecase";
|
||||
case FullCompExclusion: return "Comp_Ex";
|
||||
case FullCompInclusion: return "Comp_In";
|
||||
case FC_NFKC_Closure: return "FNC; " + Utility.hex(getComputedValue(cp));
|
||||
case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
|
||||
return NAME[propMask-QuickNFD] + "_" + getComputedValue(cp);
|
||||
default: return "Unimplemented!!";
|
||||
}
|
||||
}
|
||||
|
||||
//public String optionalComment(int cp) {
|
||||
// return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]";
|
||||
//}
|
||||
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return width;
|
||||
}
|
||||
|
||||
|
||||
static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};
|
||||
/*
|
||||
public String optionalComment(int cp) {
|
||||
String id = ucdData.getCategoryID(cp);
|
||||
if (UCD.mainCategoryMask(ucdData.getCategory(cp)) == LETTER_MASK) return id.substring(0,1) + "*";
|
||||
return id;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
if (!ucdData.isAssigned(cp)) return EXCLUDE;
|
||||
//if (cp == 0xFFFF) {
|
||||
// System.out.println("# " + Utility.hex(cp));
|
||||
//}
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
//if (cp == 0x0385) {
|
||||
// System.out.println(Utility.hex(firstRealCp));
|
||||
//}
|
||||
|
||||
String cps;
|
||||
byte xCat;
|
||||
|
||||
switch (propMask) {
|
||||
default: return EXCLUDE;
|
||||
|
||||
case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
|
||||
if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
|
||||
cps = UTF32.valueOf32(cp);
|
||||
if (UTF32.length32(nf[propMask-ExpandsOnNFD].normalize(cps)) == UTF32.length32(cps)) return EXCLUDE;
|
||||
break;
|
||||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
|
||||
cps = UTF32.valueOf32(cp);
|
||||
if (cps.equals(nf[propMask-GenNFD].normalize(cps))) {
|
||||
return EXCLUDE;
|
||||
}
|
||||
if (cp >= 0xAC00 && cp <= 0xD7A3) return INCLUDE;
|
||||
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[propMask-4].normalize(cps)));
|
||||
return BREAK;
|
||||
case ID_Start:
|
||||
if (ucdData.isIdentifierStart(cp, false)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case ID_Continue_NO_Cf:
|
||||
if (ucdData.isIdentifierContinue_NO_Cf(cp, false)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case Mod_ID_Start:
|
||||
if (ucdData.isIdentifierStart(cp, true)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case Mod_ID_Continue_NO_Cf:
|
||||
if (ucdData.isIdentifierContinue_NO_Cf(cp, true)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case PropMath:
|
||||
if (cat == Sm
|
||||
|| ucdData.getBinaryProperty(cp,Math_Property)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case PropAlphabetic:
|
||||
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
|
||||
|| ucdData.getBinaryProperty(cp, Alphabetic)) return INCLUDE;
|
||||
case PropLowercase:
|
||||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case PropUppercase:
|
||||
if (cat == Lu
|
||||
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case Missing_Uppercase:
|
||||
if (cat == Lu
|
||||
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return EXCLUDE;
|
||||
xCat = getDecompCat(cp);
|
||||
if (xCat == Lu) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case Missing_Lowercase:
|
||||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return EXCLUDE;
|
||||
xCat = getDecompCat(cp);
|
||||
if (xCat == Ll) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case Missing_Mixedcase:
|
||||
if (cat == Lt) return EXCLUDE;
|
||||
xCat = getDecompCat(cp);
|
||||
if (xCat == Lt) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
case FullCompExclusion:
|
||||
/*
|
||||
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
|
||||
including all characters whose canonical decomposition consists of a single character.
|
||||
(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
|
||||
file by including all characters whose canonical decomposition consists of a sequence
|
||||
of characters, the first of which has a non-zero combining class.
|
||||
*/
|
||||
{
|
||||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return EXCLUDE;
|
||||
|
||||
if (isCompEx(cp)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
}
|
||||
case FullCompInclusion:
|
||||
{
|
||||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return EXCLUDE;
|
||||
|
||||
if (isCompEx(cp)) return EXCLUDE;
|
||||
return INCLUDE;
|
||||
}
|
||||
case FC_NFKC_Closure:
|
||||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
|
||||
/*
|
||||
b = Normalize(Fold(a));
|
||||
c = Normalize(Fold(b));
|
||||
if (c != b) add a => c
|
||||
*/
|
||||
{
|
||||
String b = nfkc.normalize(fold(cp));
|
||||
String c = nfkc.normalize(fold(b));
|
||||
if (c.equals(b)) return EXCLUDE;
|
||||
setComputedValue(cp, c);
|
||||
if (cp == 0x1F88) {
|
||||
System.out.println(ucdData.toString(cp));
|
||||
System.out.println("cp: " + ucdData.getCodeAndName(cp));
|
||||
System.out.println("fold(cp): " + ucdData.getCodeAndName(fold(cp)));
|
||||
System.out.println("b: " + ucdData.getCodeAndName(b));
|
||||
System.out.println("fold(b): " + ucdData.getCodeAndName(fold(b)));
|
||||
System.out.println("c: " + ucdData.getCodeAndName(c));
|
||||
}
|
||||
return BREAK;
|
||||
}
|
||||
|
||||
case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
|
||||
lastValue = currentValue;
|
||||
Normalizer nfx = nf[propMask - QuickNFD];
|
||||
if (nfx.normalizationDiffers(cp)) currentValue = "NO";
|
||||
else if (nfx.isTrailing(cp)) currentValue = "MAYBE";
|
||||
else return EXCLUDE;
|
||||
setComputedValue(cp, currentValue);
|
||||
if (currentValue != lastValue) return BREAK;
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
// handle script stuff
|
||||
/*
|
||||
if (firstRealCp == -1) return INCLUDE;
|
||||
byte cat2 = ucdData.getCategory(firstRealCp);
|
||||
if (cat == cat2) return INCLUDE;
|
||||
int mc = UCD.mainCategoryMask(cat);
|
||||
if (LETTER_MASK == mc && mc == UCD.mainCategoryMask(cat2)) return INCLUDE;
|
||||
|
||||
return BREAK;
|
||||
*/
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
static Map computedValue = new HashMap();
|
||||
static String getComputedValue(int cp) {
|
||||
return (String) computedValue.get(new Integer(cp));
|
||||
}
|
||||
static void setComputedValue(int cp, String value) {
|
||||
computedValue.put(new Integer(cp), value);
|
||||
}
|
||||
static String lastValue = "";
|
||||
static String currentValue = "";
|
||||
|
||||
boolean isCompEx(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, CompositionExclusion)) return true;
|
||||
String decomp = ucdData.getDecompositionMapping(cp);
|
||||
if (UTF32.length32(decomp) == 1) return true;
|
||||
int first = UTF32.char32At(decomp,0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
StringBuffer foldBuffer = new StringBuffer();
|
||||
|
||||
String fold(int cp) {
|
||||
return ucdData.getCase(cp, FULL, FOLD);
|
||||
}
|
||||
|
||||
String fold(String s) {
|
||||
return ucdData.getCase(s, FULL, FOLD);
|
||||
}
|
||||
|
||||
byte getDecompCat(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lu
|
||||
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return Lu;
|
||||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
if (!nf[2].normalizationDiffers(cp)) return Lo;
|
||||
|
||||
String norm = nf[2].normalize(cp);
|
||||
int cp2;
|
||||
boolean gotUpper = false;
|
||||
boolean gotLower = false;
|
||||
boolean gotTitle = false;
|
||||
for (int i = 0; i < norm.length(); i += UTF32.count16(cp2)) {
|
||||
cp2 = UTF32.char32At(norm, i);
|
||||
byte catx = ucdData.getCategory(cp2);
|
||||
boolean upx = ucdData.getBinaryProperty(cp, Other_Uppercase);
|
||||
boolean lowx = ucdData.getBinaryProperty(cp, Other_Lowercase);
|
||||
if (catx == Ll || lowx || cp2 == 0x345) gotLower = true;
|
||||
if (catx == Lu || upx) gotUpper = true;
|
||||
if (catx == Lt) gotTitle = true;
|
||||
}
|
||||
if (gotLower && !gotUpper && !gotTitle) return Ll;
|
||||
if (!gotLower && gotUpper && !gotTitle) return Lu;
|
||||
if (gotLower || gotUpper || gotTitle) return Lt;
|
||||
return cat;
|
||||
}
|
||||
}
|
||||
|
65
tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
Normal file
65
tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
Normal file
|
@ -0,0 +1,65 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
class DiffPropertyLister extends PropertyLister {
|
||||
private UCD oldUCD;
|
||||
|
||||
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
|
||||
this.output = output;
|
||||
this.ucdData = UCD.make(newUCDName);
|
||||
if (oldUCDName != null) this.oldUCD = UCD.make(oldUCDName);
|
||||
}
|
||||
|
||||
public byte status (int cp) {
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return ucdData.getVersion();
|
||||
}
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public byte status(int lastCp, int cp) {
|
||||
/*if (cp == 0xFFFF) {
|
||||
System.out.println("# " + Utility.hex(cp));
|
||||
}
|
||||
*/
|
||||
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
|
||||
public int print() {
|
||||
String status;
|
||||
if (oldUCD != null) {
|
||||
status = "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion();
|
||||
} else {
|
||||
status = "# Allocated as of " + ucdData.getVersion();
|
||||
}
|
||||
output.println();
|
||||
output.println();
|
||||
output.println(status);
|
||||
output.println();
|
||||
System.out.println(status);
|
||||
int count = super.print();
|
||||
output.println();
|
||||
if (oldUCD != null) {
|
||||
output.println("# Total " + count + " new code points allocated in " + ucdData.getVersion());
|
||||
} else {
|
||||
output.println("# Total " + count + " code points allocated in " + ucdData.getVersion());
|
||||
}
|
||||
|
||||
output.println();
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
|
342
tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
Normal file
342
tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
Normal file
|
@ -0,0 +1,342 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class GenerateCaseFolding implements UCD_Types {
|
||||
public static boolean DEBUG = false;
|
||||
public static UCD ucd = UCD.make("310");
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
makeCaseFold();
|
||||
//getAge();
|
||||
}
|
||||
|
||||
public static void makeCaseFold() throws java.io.IOException {
|
||||
System.out.println("Making Full Data");
|
||||
Map fullData = getCaseFolding(true);
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleData = getCaseFolding(false);
|
||||
// write the data
|
||||
|
||||
System.out.println("Writing");
|
||||
PrintWriter out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream("CaseFoldingSample.txt"),
|
||||
"UTF8"),
|
||||
4*1024));
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
|
||||
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
|
||||
if (rFull == null && rSimple == null) continue;
|
||||
if (rFull != null && rFull.equals(rSimple)) {
|
||||
String type = "C";
|
||||
if (ch == 0x130 || ch == 0x131) type = "I";
|
||||
drawLine(out, ch, type, rFull);
|
||||
} else {
|
||||
if (rFull != null) {
|
||||
drawLine(out, ch, "F", rFull);
|
||||
}
|
||||
if (rSimple != null) {
|
||||
drawLine(out, ch, "S", rSimple);
|
||||
}
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
static void drawLine(PrintWriter out, int ch, String type, String result) {
|
||||
out.println(Utility.hex(ch)
|
||||
+ "; " + type +
|
||||
"; " + Utility.hex(result, " ") +
|
||||
"; # " + ucd.getName(ch));
|
||||
}
|
||||
|
||||
|
||||
static Map getCaseFolding(boolean full) throws java.io.IOException {
|
||||
Map data = new TreeMap();
|
||||
Map repChar = new TreeMap();
|
||||
//String option = "";
|
||||
|
||||
// get the equivalence classes
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!ucd.isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full);
|
||||
}
|
||||
|
||||
// get the representative characters
|
||||
|
||||
Iterator it = data.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
Set set = (Set) data.get(s);
|
||||
String rep = null;
|
||||
int repGood = 0;
|
||||
String dup = null;
|
||||
Iterator it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
int s2Good = goodness(s2, full);
|
||||
if (s2Good > repGood) {
|
||||
rep = s2;
|
||||
repGood = s2Good;
|
||||
dup = null;
|
||||
} else if (s2Good == repGood) {
|
||||
dup = s2;
|
||||
}
|
||||
}
|
||||
if (rep == null) System.err.println("No representative for: " + toString(set));
|
||||
else if (repGood < 128) {
|
||||
System.err.println("Non-optimal!!: "
|
||||
+ ucd.getName(rep) + ", " + toString(set,true));
|
||||
}
|
||||
it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
if (s2.length() == 1 && !s2.equals(rep)) repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
||||
}
|
||||
}
|
||||
return repChar;
|
||||
}
|
||||
|
||||
static int goodness(String s, boolean full) {
|
||||
if (s == null) return 0;
|
||||
int result = s.length();
|
||||
if (s.equals(lower(upper(s, full), full))) result |= 128;
|
||||
if (s.equals(NFC.normalize(s))) result |= 64;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static Normalizer NFC = new Normalizer(Normalizer.NFC);
|
||||
/*
|
||||
static HashSet temp = new HashSet();
|
||||
static void normalize(HashSet set) {
|
||||
temp.clear();
|
||||
temp.addAll(set);
|
||||
set.clear();
|
||||
Iterator it = temp.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
String s2 = KC.normalize(s);
|
||||
set.add(s);
|
||||
data2.put(s,set);
|
||||
if (!s.equals(s2)) {
|
||||
set.add(s2);
|
||||
data2.put(s2,set);
|
||||
System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
String
|
||||
String lower1 = ucd.getLowercase(ch);
|
||||
String lower2 = ucd.toLowercase(ch,option);
|
||||
|
||||
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
|
||||
//String lower1 = String.valueOf(ucd.getLowercase(ch));
|
||||
//String lower = ucd.toLowercase(ch2,option);
|
||||
String upper = ucd.toUppercase(ch2,option);
|
||||
String lowerUpper = ucd.toLowercase(upper,option);
|
||||
//String title = ucd.toTitlecase(ch2,option);
|
||||
//String lowerTitle = ucd.toLowercase(upper,option);
|
||||
|
||||
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
||||
output.println(Utility.hex(ch)
|
||||
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
|
||||
+ "; " + Utility.hex(lowerUpper," ")
|
||||
+ ";\t#" + ucd.getName(ch)
|
||||
);
|
||||
//if (!lowerUpper.equals(lower)) {
|
||||
// output.println("Warning1: " + Utility.hex(lower) + " " + ucd.getName(lower));
|
||||
//}
|
||||
//if (!lowerUpper.equals(lowerTitle)) {
|
||||
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + ucd.getName(lowerTitle));
|
||||
//}
|
||||
}
|
||||
*/
|
||||
|
||||
static void getClosure(int ch, Map data, boolean full) {
|
||||
String charStr = UTF32.valueOf32(ch);
|
||||
String lowerStr = lower(charStr, full);
|
||||
String titleStr = title(charStr, full);
|
||||
String upperStr = upper(charStr, full);
|
||||
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
|
||||
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
|
||||
|
||||
// make new set
|
||||
Set set = new TreeSet();
|
||||
set.add(charStr);
|
||||
data.put(charStr, set);
|
||||
|
||||
// add cases to get started
|
||||
add(set, lowerStr, data);
|
||||
add(set, upperStr, data);
|
||||
add(set, titleStr, data);
|
||||
|
||||
// close it
|
||||
main:
|
||||
while (true) {
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
// do funny stuff since we can't modify set while iterating
|
||||
//if (add(set, NFC.normalize(s), data)) continue main;
|
||||
if (add(set, lower(s, full), data)) continue main;
|
||||
if (add(set, title(s, full), data)) continue main;
|
||||
if (add(set, upper(s, full), data)) continue main;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static String lower(String s, boolean full) {
|
||||
String result = lower2(s,full);
|
||||
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
||||
}
|
||||
|
||||
// These functions are no longer necessary, since UCD is parameterized,
|
||||
// but it's not worth changing
|
||||
|
||||
static String lower2(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
||||
}
|
||||
return ucd.getCase(s, FULL, LOWER);
|
||||
}
|
||||
|
||||
static String upper(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
||||
}
|
||||
return ucd.getCase(s, SIMPLE, UPPER);
|
||||
}
|
||||
|
||||
static String title(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
||||
}
|
||||
return ucd.getCase(s, SIMPLE, TITLE);
|
||||
}
|
||||
|
||||
static boolean add(Set set, String s, Map data) {
|
||||
if (set.contains(s)) return false;
|
||||
set.add(s);
|
||||
if (DEBUG) System.err.println("adding: " + toString(set));
|
||||
Set other = (Set) data.get(s);
|
||||
if (other != null && other != set) { // merge
|
||||
// make all the items in set point to merged set
|
||||
Iterator it = other.iterator();
|
||||
while (it.hasNext()) {
|
||||
data.put(it.next(), set);
|
||||
}
|
||||
set.addAll(other);
|
||||
}
|
||||
if (DEBUG) System.err.println("done adding: " + toString(set));
|
||||
return true;
|
||||
}
|
||||
|
||||
static String toString(Set set) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
boolean first = true;
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String) it2.next();
|
||||
if (!first) result += ", ";
|
||||
first = false;
|
||||
result += Utility.hex(s2, " ");
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
static String toString(Set set, boolean t) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
boolean first = true;
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String) it2.next();
|
||||
if (!first) result += ", ";
|
||||
first = false;
|
||||
result += ucd.getName(s2);
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
static final void getAge() throws IOException {
|
||||
PrintStream log = new PrintStream(
|
||||
new BufferedOutputStream (
|
||||
new FileOutputStream("UnicodeAge.txt"),
|
||||
4*1024));
|
||||
try {
|
||||
log.println("# Derived file showing when various code points were allocated in Unicode");
|
||||
log.println("# author: M. Davis");
|
||||
log.println("# generated: " + new Date());
|
||||
log.println("# Notes:");
|
||||
log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing.");
|
||||
log.println("# - The supplementary private use code points, although allocated earlier,");
|
||||
log.println("# were NOT specifically listed in the UCD until 3.0.1, and are not included until then.");
|
||||
new DiffPropertyLister(null, "110", log).print();
|
||||
new DiffPropertyLister("110", "200", log).print();
|
||||
new DiffPropertyLister("200", "210", log).print();
|
||||
new DiffPropertyLister("210", "300", log).print();
|
||||
new DiffPropertyLister("300", "310", log).print();
|
||||
/*
|
||||
printDiff("110", "200");
|
||||
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
|
||||
UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
|
||||
UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
|
||||
UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
|
||||
UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
|
||||
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
|
||||
+ n.format(u11.count()));
|
||||
log.println();
|
||||
u11.print(log, false, false, "1.1");
|
||||
|
||||
UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
|
||||
+ n.format(u20m.count()));
|
||||
log.println();
|
||||
u20m.print(log, false, false, "2.0");
|
||||
|
||||
UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
|
||||
+ n.format(u21m.count()));
|
||||
log.println();
|
||||
u21m.print(log, false, false, "2.1");
|
||||
|
||||
UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
|
||||
+ n.format(u30m.count()));
|
||||
log.println();
|
||||
u30m.print(log, false, false, "3.0");
|
||||
|
||||
UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
|
||||
+ n.format(u31m.count()));
|
||||
log.println();
|
||||
u31m.print(log, false, false, "3.1");
|
||||
*/
|
||||
} finally {
|
||||
if (log != null) log.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
667
tools/unicodetools/com/ibm/text/UCD/GenerateData.java
Normal file
667
tools/unicodetools/com/ibm/text/UCD/GenerateData.java
Normal file
|
@ -0,0 +1,667 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class GenerateData implements UCD_Types {
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
ucd = UCD.make();
|
||||
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
String version = ucd.getVersion();
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
int mask = 0;
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Argument: " + args[i]);
|
||||
|
||||
if (arg.equalsIgnoreCase("version")) {
|
||||
version = args[++i];
|
||||
ucd = UCD.make(version);
|
||||
} else if (arg.equalsIgnoreCase("partition")) {
|
||||
partitionProperties();
|
||||
} else if (arg.equalsIgnoreCase("list")) {
|
||||
listProperties();
|
||||
} else if (arg.equalsIgnoreCase("diff")) {
|
||||
listDifferences();
|
||||
} else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
|
||||
generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedBidiClass-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedGeneralCategory-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
|
||||
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedCombiningClass-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
|
||||
generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedDecompositionType-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericType")) {
|
||||
generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedNumericType-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
|
||||
generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedJoiningType-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
|
||||
generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedJoiningGroup-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
|
||||
generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedBinaryProperties-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
|
||||
generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedNumericValues-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
|
||||
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedLineBreak-" + version );
|
||||
} else if (arg.equalsIgnoreCase("Scripts")) {
|
||||
generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-");
|
||||
} else if (arg.equalsIgnoreCase("PropList")) {
|
||||
generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
|
||||
KEEP_SPECIAL, HEADER_EXTEND, "PropList-" + version);
|
||||
} else if (arg.equalsIgnoreCase("AllBinary")) {
|
||||
generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
KEEP_SPECIAL, HEADER_EXTEND, "AllBinary-" + version);
|
||||
} else if (arg.equalsIgnoreCase("NormalizationTest")) {
|
||||
writeNormalizerTestSuite("NormalizationTest-" + version + ".txt" );
|
||||
} else if (arg.equalsIgnoreCase("generateCompExclusions")) {
|
||||
generateCompExclusions();
|
||||
}else {
|
||||
System.out.println(" ! Unknown option -- must be one of the following (case-insensitive)");
|
||||
System.out.println(" ! generateCompExclusions,...");
|
||||
}
|
||||
|
||||
|
||||
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
// HEADER_DERIVED, "DerivedPropData2-" + version );
|
||||
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-" + version );
|
||||
//listStrings("LowerCase-" + version , 0,0);
|
||||
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-" + version );
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
//
|
||||
}
|
||||
System.out.println("END");
|
||||
}
|
||||
|
||||
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
public static void checkHoffman(String test) {
|
||||
String result = nfkc.normalize(test);
|
||||
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
|
||||
System.out.println();
|
||||
show(test, 0);
|
||||
System.out.println();
|
||||
show(result, 0);
|
||||
}
|
||||
|
||||
public static void show(String s, int indent) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
String cc = " " + ucd.getCombiningClass(cp);
|
||||
cc = Utility.repeat(" ", 4 - cc.length()) + cc;
|
||||
System.out.println(Utility.repeat(" ", indent) + ucd.getCode(cp) + cc + " " + ucd.getName(cp));
|
||||
String decomp = nfkc.normalize(cp);
|
||||
if (!decomp.equals(UTF32.valueOf32(cp))) {
|
||||
show(decomp, indent + 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Timestamp format for the "# Date:" line of generated files. The pattern ends
// with the literal text " GMT"; it does not print the formatter's actual zone.
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");

// Pin the formatter to GMT so the literal " GMT" suffix in the pattern is accurate.
static {
    myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
public static String fixFile(String s) {
|
||||
int len = s.length();
|
||||
if (!s.endsWith(".txt")) return s;
|
||||
if (s.charAt(len-6) != 'd') return s;
|
||||
char c = s.charAt(len-5);
|
||||
if (c < '0' || '9' < c) return s;
|
||||
System.out.println("Fixing File Name");
|
||||
return s.substring(0,len-6) + s.substring(len-4);
|
||||
}
|
||||
|
||||
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
|
||||
|
||||
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
|
||||
output.println("# " + fileName + ".txt");
|
||||
output.println("#");
|
||||
if (headerChoice == HEADER_SCRIPTS) {
|
||||
output.println("# For documentation, see UTR #24: Script Names");
|
||||
output.println("# http://www.unicode.org/unicode/reports/tr24/");
|
||||
} else if (headerChoice == HEADER_EXTEND) {
|
||||
output.println("# Unicode Character Database: Extended Properties");
|
||||
output.println("# For documentation, see PropList.html");
|
||||
} else {
|
||||
output.println("# Unicode Character Database: Derived Property Data");
|
||||
output.println("# Generated algorithmically from the Unicode Character Database");
|
||||
output.println("# For documentation, see DerivedProperties.html");
|
||||
}
|
||||
output.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
|
||||
output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
|
||||
output.println("# except when listing Noncharacter or Cn.");
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
|
||||
doHeader(fileName, output, headerChoice);
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
if ((bitMask & (1<<i)) == 0) continue;
|
||||
if (i >= DerivedPropertyLister.LIMIT) break;
|
||||
System.out.print('.');
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
new DerivedPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
/*
|
||||
public static void listStrings(String file, int type, int subtype) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
UCD ucd30 = UCD.make("300");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) System.out.println("# " + i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
if (ucd30.isRepresented(i)) continue;
|
||||
String string = "";
|
||||
switch(type) {
|
||||
case 0: string = ucd.getSimpleLowercase(i);
|
||||
}
|
||||
if (UTF32.length32(string) == 1 && UTF32.char32At(string,0) == i) continue;
|
||||
output.println(Utility.hex(i) + "; C; " + Utility.hex(string) + "; # " + ucd.getName(i));
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
*/
|
||||
|
||||
public static void generateCompExclusions() throws IOException {
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
|
||||
new CompLister(output).print();
|
||||
output.close();
|
||||
}
|
||||
|
||||
static class CompLister extends PropertyLister {
|
||||
UCD oldUCD;
|
||||
int oldLength = 0;
|
||||
|
||||
public CompLister(PrintStream output) {
|
||||
this.output = output;
|
||||
ucdData = UCD.make("310");
|
||||
oldUCD = UCD.make("300");
|
||||
showOnConsole = true;
|
||||
}
|
||||
public String propertyName(int cp) {
|
||||
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
|
||||
}
|
||||
public byte status(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
|
||||
int temp = oldLength;
|
||||
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
|
||||
if (temp != oldLength) return BREAK;
|
||||
return INCLUDE;
|
||||
}
|
||||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
public static void partitionProperties() throws IOException {
|
||||
|
||||
// find properties
|
||||
|
||||
int count = 0;
|
||||
int[] props = new int[500];
|
||||
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
props[count++] = i;
|
||||
}
|
||||
System.out.println("props: " + count);
|
||||
|
||||
BitSet probe = new BitSet();
|
||||
Map map = new HashMap();
|
||||
int total = 0;
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = ucd.getCategory(cp);
|
||||
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
|
||||
if (iProp) probe.set(i); else probe.clear(i);
|
||||
}
|
||||
|
||||
++total;
|
||||
if (!map.containsKey(probe)) {
|
||||
map.put(probe.clone(), UTF32.valueOf32(cp));
|
||||
Utility.fixDot();
|
||||
System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + ucd.getCodeAndName(cp));
|
||||
}
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Set Size: " + map.size());
|
||||
}
|
||||
|
||||
public static void listDifferences() throws IOException {
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "PropertyDifferences.txt"));
|
||||
|
||||
for (int i = 1; i < LIMIT_ENUM; ++i) {
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
|
||||
|
||||
System.out.println();
|
||||
System.out.println();
|
||||
System.out.println(iNameLong);
|
||||
output.println("#" + iNameLong);
|
||||
|
||||
int last = -1;
|
||||
for (int j = i+1; j < LIMIT_ENUM; ++j) {
|
||||
int jType = j & 0xFF00;
|
||||
if (jType == JOINING_GROUP || jType == AGE || jType == COMBINING_CLASS || jType == SCRIPT
|
||||
|| (jType == iType && jType != BINARY_PROPERTIES)) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, j)) continue;
|
||||
|
||||
if ((j >> 8) != last) {
|
||||
last = j >> 8;
|
||||
System.out.println();
|
||||
System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
|
||||
output.flush();
|
||||
output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
|
||||
} else {
|
||||
System.out.print('.');
|
||||
}
|
||||
System.out.flush();
|
||||
|
||||
int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
int cat = ucd.getCategory(cp);
|
||||
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
|
||||
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
|
||||
|
||||
if (jProp) ++jCount;
|
||||
if (iProp) {
|
||||
++iCount;
|
||||
if (jProp) ++bothCount;
|
||||
else ++i_jPropCount;
|
||||
} else if (jProp) ++j_iPropCount;
|
||||
}
|
||||
if (iCount == 0 || jCount == 0) continue;
|
||||
|
||||
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
|
||||
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
|
||||
|
||||
String rel = bothCount == 0 ? "DISJOINT"
|
||||
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
|
||||
: i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
|
||||
: j_iPropCount == 0 ? "CONTAINS"
|
||||
: "OVERLAPS";
|
||||
|
||||
if (j_iPropCount > i_jPropCount) {
|
||||
// reverse output
|
||||
output.println(jNameShort + "\t" + iNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
|
||||
} else {
|
||||
output.println(iNameShort + "\t" + jNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
public static void listProperties() {
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
int type = i & 0xFF00;
|
||||
if (type == JOINING_GROUP || type == AGE) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
String value = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
|
||||
if (value.length() == 0) value = "none";
|
||||
else if (value.equals("<unused>")) continue;
|
||||
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
if (abbvalue.length() == 0) abbvalue = "no";
|
||||
|
||||
if (type == COMBINING_CLASS) {
|
||||
value = MyPropertyLister.getCombiningName(i);
|
||||
if (value.length() == 0) {
|
||||
if ((i & 0xFF) == 0) value = "99";
|
||||
else continue;
|
||||
}
|
||||
abbvalue = value;
|
||||
}
|
||||
|
||||
String elide = "";
|
||||
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
|
||||
+ abbvalue
|
||||
+ "}";
|
||||
String abb = "";
|
||||
if (type != BINARY_PROPERTIES) abb = "\\p{"
|
||||
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
|
||||
+ "="
|
||||
+ abbvalue
|
||||
+ "}";
|
||||
String norm = "";
|
||||
if (type != BINARY_PROPERTIES) norm = "\\p{"
|
||||
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
|
||||
+ "="
|
||||
+ value
|
||||
+ "}";
|
||||
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
|
||||
}
|
||||
}
|
||||
|
||||
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
|
||||
int headerChoice, String file) throws IOException {
|
||||
|
||||
//System.out.println(ucd.toString(0x1E0A));
|
||||
/*
|
||||
System.out.println(ucd.getData(0xFFFF));
|
||||
System.out.println(ucd.getData(0x100000));
|
||||
System.out.println(ucd.getData(0x100000-1));
|
||||
System.out.println(ucd.getData(0x100000-2));
|
||||
System.out.println(ucd.getData(0x100000-3));
|
||||
if (true) return;
|
||||
String test2 = ucd.getName(0x2A6D6);
|
||||
//*/
|
||||
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file + "dX.txt"));
|
||||
doHeader(file, output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (JOINING_TYPE | JT_U)
|
||||
|| i == (JOINING_GROUP | NO_SHAPING)
|
||||
) continue; // skip zero case
|
||||
if (skipSpecial == SKIP_SPECIAL
|
||||
&& i >= (BINARY_PROPERTIES | CompositionExclusion)
|
||||
&& i < (AGE + NEXT_ENUM)) continue;
|
||||
if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
|
||||
output.println();
|
||||
output.println("# ================================================");
|
||||
output.println("# " + UCD_Names.UNIFIED_PROPERTIES[i>>8]);
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
System.out.println();
|
||||
System.out.println(UCD_Names.UNIFIED_PROPERTIES[i>>8]);
|
||||
last = i;
|
||||
} else {
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
System.out.print(".");
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
if (endEnum == LIMIT_ENUM) {
|
||||
output.println();
|
||||
output.println("# ================================================");
|
||||
output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
System.out.println();
|
||||
System.out.println("@NUMERIC VALUES");
|
||||
|
||||
Set floatSet = new TreeSet();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
float nv = ucd.getNumericValue(i);
|
||||
if (Float.isNaN(nv)) continue;
|
||||
floatSet.add(new Float(nv));
|
||||
}
|
||||
Iterator it = floatSet.iterator();
|
||||
while(it.hasNext()) {
|
||||
new MyFloatLister(ucd, ((Float)it.next()).floatValue(), output).print();
|
||||
output.println();
|
||||
System.out.print(".");
|
||||
}
|
||||
}
|
||||
output.close();
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
static public Normalizer formC, formD, formKC, formKD;
|
||||
|
||||
static public void writeNormalizerTestSuite(String fileName) throws IOException {
|
||||
ucd = UCD.make();
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
|
||||
formC = new Normalizer(Normalizer.NFC);
|
||||
formD = new Normalizer(Normalizer.NFD);
|
||||
formKC = new Normalizer(Normalizer.NFKC);
|
||||
formKD = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
String[] example = new String[256];
|
||||
|
||||
log.println("# " + fixFile(fileName));
|
||||
log.println("#");
|
||||
log.println("# Normalization Test Suite");
|
||||
log.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
|
||||
log.println("# Format:");
|
||||
log.println("#");
|
||||
log.println("# Columns (c1, c2,...) are separated by semicolons");
|
||||
log.println("# Comments are indicated with hash marks");
|
||||
log.println("#");
|
||||
log.println("# CONFORMANCE:");
|
||||
log.println("# 1. The following invariants must be true for all conformant implementations");
|
||||
log.println("#");
|
||||
log.println("# NFC");
|
||||
log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
|
||||
log.println("# c4 == NFC(c4) == NFC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFD");
|
||||
log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
|
||||
log.println("# c5 == NFD(c4) == NFD(c5");
|
||||
log.println("#");
|
||||
log.println("# NFKC");
|
||||
log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFKD");
|
||||
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
|
||||
log.println("#");
|
||||
log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
|
||||
log.println("# listed in Part 1, the following invariants must be true for all conformant");
|
||||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
|
||||
System.out.println("Writing Part 1");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part0 # Specific cases");
|
||||
log.println("#");
|
||||
|
||||
for (int j = 0; j < testSuiteCases.length; ++j) {
|
||||
writeLine(testSuiteCases[j], log, false);
|
||||
}
|
||||
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part1 # Character by character test");
|
||||
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
|
||||
log.println("#");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
String cc = UTF32.valueOf32(ch);
|
||||
writeLine(cc,log, true);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Finding Examples");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
int cc = ucd.getCombiningClass(ch);
|
||||
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part2 # Canonical Order Test");
|
||||
log.println("#");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
short c = ucd.getCombiningClass(ch);
|
||||
if (c == 0) continue;
|
||||
|
||||
// add character with higher class, same class, lower class
|
||||
|
||||
String sample = "";
|
||||
for (int i = c+1; i < example.length; ++i) {
|
||||
if (example[i] == null) continue;
|
||||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
sample += example[c];
|
||||
for (int i = c-1; i > 0; --i) {
|
||||
if (example[i] == null) continue;
|
||||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
|
||||
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
|
||||
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
|
||||
}
|
||||
Utility.fixDot();
|
||||
log.println("#");
|
||||
log.println("# END OF FILE");
|
||||
log.close();
|
||||
}
|
||||
|
||||
static void writeLine(String cc, PrintWriter log, boolean check) {
|
||||
String c = formC.normalize(cc);
|
||||
String d = formD.normalize(cc);
|
||||
String kc = formKC.normalize(cc);
|
||||
String kd = formKD.normalize(cc);
|
||||
if (check & cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd)) return;
|
||||
|
||||
// consistency check
|
||||
String dc = formD.normalize(c);
|
||||
String dkc = formD.normalize(kc);
|
||||
if (!dc.equals(d) || !dkc.equals(kd)) {
|
||||
System.out.println("Danger Will Robinson!");
|
||||
Normalizer.SHOW_PROGRESS = true;
|
||||
d = formD.normalize(cc);
|
||||
}
|
||||
|
||||
// printout
|
||||
log.println(
|
||||
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
|
||||
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
|
||||
+ "; # ("
|
||||
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
|
||||
+ ") " + ucd.getName(cc));
|
||||
}
|
||||
|
||||
static StringBuffer commaResult = new StringBuffer();
|
||||
|
||||
// not recursive!!!
|
||||
static final String comma(String s) {
|
||||
commaResult.setLength(0);
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(i)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (ucd.getCategory(cp) == Mn) commaResult.append('\u25CC');
|
||||
UTF32.append32(commaResult, cp);
|
||||
}
|
||||
return commaResult.toString();
|
||||
}
|
||||
|
||||
static final String[] testSuiteCases = {
|
||||
"\u1E0A",
|
||||
"\u1E0C",
|
||||
"\u1E0A\u0323",
|
||||
"\u1E0C\u0307",
|
||||
"D\u0307\u0323",
|
||||
"D\u0323\u0307",
|
||||
"\u1E0A\u031B",
|
||||
"\u1E0C\u031B",
|
||||
"\u1E0A\u031B\u0323",
|
||||
"\u1E0C\u031B\u0307",
|
||||
"D\u031B\u0307\u0323",
|
||||
"D\u031B\u0323\u0307",
|
||||
"\u00C8",
|
||||
"\u0112",
|
||||
"E\u0300",
|
||||
"E\u0304",
|
||||
"\u1E14",
|
||||
"\u0112\u0300",
|
||||
"\u1E14\u0304",
|
||||
"E\u0304\u0300",
|
||||
"E\u0300\u0304",
|
||||
"\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F",
|
||||
"\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"
|
||||
|
||||
};
|
||||
|
||||
}
|
314
tools/unicodetools/com/ibm/text/UCD/MLStreamWriter.java
Normal file
314
tools/unicodetools/com/ibm/text/UCD/MLStreamWriter.java
Normal file
|
@ -0,0 +1,314 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import com.ibm.text.UCD.*;
|
||||
|
||||
public class MLStreamWriter extends Writer {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
public MLStreamWriter (PrintWriter output, boolean HTML) {
|
||||
out = output;
|
||||
isHTML = HTML;
|
||||
}
|
||||
|
||||
public MLStreamWriter (PrintWriter output) {
|
||||
this(output,true);
|
||||
}
|
||||
|
||||
public MLStreamWriter el(String elementName) {
|
||||
closeIfOpen();
|
||||
print('<', AFTER);
|
||||
print(elementName, elementName.equals("!--") ? AFTER+FORCE : AFTER);
|
||||
stack.add(elementName);
|
||||
inElement = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
private MLStreamWriter closeIfOpen() {
|
||||
if (inElement && !"!--".equals(stack.get(stack.size()-1))) {
|
||||
print('>',BEFORE+FORCE);
|
||||
}
|
||||
inElement = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cel(String elementName) {
|
||||
return cl().tx(elementName);
|
||||
}
|
||||
|
||||
public MLStreamWriter at(String attributeName, String attributeValue) {
|
||||
if (!inElement) {
|
||||
throw new IllegalArgumentException("attribute \"" + attributeName + "\" not in element");
|
||||
}
|
||||
print(' ', BOTH);
|
||||
print(attributeName, AFTER);
|
||||
print('=', AFTER);
|
||||
print('"');
|
||||
print(quoted(attributeValue));
|
||||
print('"', AFTER);
|
||||
return this;
|
||||
}
|
||||
|
||||
public MLStreamWriter at(String attributeName, int value) {
|
||||
return at(attributeName, String.valueOf(value));
|
||||
}
|
||||
|
||||
public MLStreamWriter CR() {
|
||||
closeIfOpen();
|
||||
out.println();
|
||||
return this;
|
||||
}
|
||||
|
||||
/*public MLStreamWriter comment() {
|
||||
closeIfOpen();
|
||||
print("<!--");
|
||||
CR();
|
||||
return this;
|
||||
}
|
||||
|
||||
public MLStreamWriter endComment() {
|
||||
print("-->");
|
||||
return this;
|
||||
}
|
||||
*/
|
||||
|
||||
public MLStreamWriter tx(String text) {
|
||||
closeIfOpen();
|
||||
print(quoted(text));
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx(char text) {
|
||||
return tx(String.valueOf(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx(int text) {
|
||||
return tx(String.valueOf(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(String text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(char text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
final public MLStreamWriter tx16(int text) {
|
||||
return tx(hex(text));
|
||||
}
|
||||
|
||||
public MLStreamWriter cl(String closingElement) {
|
||||
closeIfOpen();
|
||||
String lastElement = (String)stack.remove(stack.size()-1);
|
||||
if (closingElement != null && !closingElement.equals(lastElement)) {
|
||||
throw new IllegalArgumentException("mismatch when closing \"" + closingElement
|
||||
+ "\", current active element is \"" + lastElement + "\"");
|
||||
}
|
||||
if (lastElement.equals("!--")) {// hack for XML/HTML
|
||||
print("-->",BEFORE+FORCE);
|
||||
} else {
|
||||
print("</");
|
||||
print(lastElement);
|
||||
print('>',BEFORE);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cl() {
|
||||
return cl(null);
|
||||
}
|
||||
|
||||
public MLStreamWriter closeAllElements() {
|
||||
for (int i = stack.size()-1; i >= 0; --i) {
|
||||
cl(null);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
// stream stuff
|
||||
|
||||
public void write(char[] source, int start, int len) {
|
||||
closeIfOpen();
|
||||
// later make more efficient!!
|
||||
out.print(quoted(new String(source, start, len)));
|
||||
}
|
||||
|
||||
public void close() {
|
||||
closeAllElements();
|
||||
out.close();
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
out.flush();
|
||||
}
|
||||
|
||||
// Utility methods
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type, String codepoint, String cat) {
|
||||
if (codepoint == null) codepoint = ch;
|
||||
int dotpos = type.indexOf('.');
|
||||
if (dotpos == -1) el(type);
|
||||
else {
|
||||
el(type.substring(0,dotpos));
|
||||
at("class",type.substring(dotpos+1));
|
||||
}
|
||||
/*
|
||||
if (color == -1) {
|
||||
el("th");
|
||||
} else {
|
||||
el("td");
|
||||
if (color != 0xFFFFFF) {
|
||||
at("bgcolor","#"+hex(color,6));
|
||||
}
|
||||
}
|
||||
*/
|
||||
tx(ch).el("br").el("tt").tx16(codepoint);
|
||||
if (cat != null) tx(" ").tx(cat);
|
||||
cl().cl().cl();
|
||||
return this;
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch) {
|
||||
return cell(ch,"td",null,null);
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type) {
|
||||
return cell(ch,type,null,null);
|
||||
}
|
||||
|
||||
final public MLStreamWriter cell(String ch, String type, String codepoint) {
|
||||
return cell(ch,type,codepoint,null);
|
||||
}
|
||||
|
||||
static public String hex(int i, int width) {
|
||||
String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
|
||||
return "00000000".substring(result.length(),width) + result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of an integer (without 0x)
|
||||
*/
|
||||
static public String hex(int i) {
|
||||
return hex(i,8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
|
||||
*/
|
||||
static public String hex(char i) {
|
||||
return hex(i,4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
|
||||
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
|
||||
*/
|
||||
static public String hex(String s, String sep) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (i != 0) result.append(sep);
|
||||
result.append(hex(s.charAt(i)));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static public String hex(String s) {
|
||||
return hex(s," ");
|
||||
}
|
||||
|
||||
|
||||
public void author(String name, String url) {
|
||||
el("font").at("size","-3").tx("[").el("a").at("href",url).tx(name).cl("a").el("script").el("!--");
|
||||
tx("document.write(', ', document.lastModified);");
|
||||
cl("!--").cl("script").tx("]").cl("font");
|
||||
}
|
||||
|
||||
// ================== PRIVATES =================
|
||||
|
||||
PrintWriter out;
|
||||
boolean isHTML;
|
||||
ArrayList stack = new ArrayList();
|
||||
boolean inElement = false;
|
||||
Normalizer formC = new Normalizer(Normalizer.NFC);
|
||||
int len;
|
||||
int maxLineLength = 60;
|
||||
// later, add better line end management, indenting
|
||||
|
||||
static final int NONE=0, BEFORE=1, AFTER=2, BOTH=3, FORCE = 4; // chosen for bits!!
|
||||
|
||||
final void print(String s) {
|
||||
print(s,NONE);
|
||||
}
|
||||
|
||||
final void print(char c) {
|
||||
print(c,NONE);
|
||||
}
|
||||
|
||||
final void print(String s, int doesBreak) {
|
||||
if ((doesBreak & BEFORE) != 0) tryBreak(s.length(), doesBreak);
|
||||
len += s.length();
|
||||
out.print(s);
|
||||
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
|
||||
}
|
||||
|
||||
final void print(char c, int doesBreak) {
|
||||
if ((doesBreak & BEFORE) != 0) tryBreak(1, doesBreak);
|
||||
++len;
|
||||
out.print(c);
|
||||
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
|
||||
}
|
||||
|
||||
void tryBreak(int toAdd, int doesBreak) {
|
||||
if ((doesBreak & FORCE) != 0 || (len + toAdd) > maxLineLength) {
|
||||
out.println();
|
||||
len = stack.size();
|
||||
for (int i = 0; i < len; ++i) out.print(' ');
|
||||
}
|
||||
}
|
||||
|
||||
public String quoted(String source) {
|
||||
source = formC.normalize(source);
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
switch(ch) {
|
||||
case '\'':
|
||||
if (!isHTML) {
|
||||
result.append("'");
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
break;
|
||||
case '\"':
|
||||
result.append(""");
|
||||
break;
|
||||
case '<':
|
||||
result.append("<");
|
||||
break;
|
||||
case '&':
|
||||
result.append("&");
|
||||
break;
|
||||
case '>':
|
||||
result.append(">");
|
||||
break;
|
||||
case '\n': case '\r': case '\t':
|
||||
result.append(ch);
|
||||
break;
|
||||
default: if (ch < ' ' // do surrogates later
|
||||
|| ch >= '\u007F' && ch <= '\u009F'
|
||||
|| ch >= '\uD800' && ch <= '\uDFFF'
|
||||
|| ch >= '\uFFFE') {
|
||||
result.append('\uFFFD');
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
}
|
31
tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java
Normal file
31
tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java
Normal file
|
@ -0,0 +1,31 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
class MyFloatLister extends PropertyLister {
|
||||
private float propMask;
|
||||
|
||||
public MyFloatLister(UCD ucd, float f, PrintStream output) {
|
||||
this.propMask = f;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return ""+ucdData.getNumericValue(cp);
|
||||
}
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return ucdData.getNumericTypeID(cp);
|
||||
}
|
||||
|
||||
public byte status(int cp) {
|
||||
//if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp));
|
||||
if (!ucdData.isRepresented(cp)) {
|
||||
if (ucdData.mapToRepresentative(cp, false) != cp) return PropertyLister.CONTINUE;
|
||||
return PropertyLister.CONTINUE;
|
||||
}
|
||||
if (ucdData.getCategory(cp) == Cn) return PropertyLister.CONTINUE;
|
||||
return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
270
tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java
Normal file
270
tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java
Normal file
|
@ -0,0 +1,270 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class MyPropertyLister extends PropertyLister {
|
||||
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
private int propMask;
|
||||
|
||||
public MyPropertyLister(UCD ucd, int propMask, PrintStream output) {
|
||||
this.propMask = propMask;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat
|
||||
}
|
||||
|
||||
static String getCombiningName (int propMask) {
|
||||
String s = "";
|
||||
switch (propMask & 0xFF) {
|
||||
case 0: s = "NotReordered"; break;
|
||||
case 1: s = "Overlay"; break;
|
||||
case 7: s = "Nukta"; break;
|
||||
case 8: s = "KanaVoicing"; break;
|
||||
case 9: s = "Virama"; break;
|
||||
case 202: s = "AttachedBelowLeft"; break;
|
||||
case 204: s = "AttachedBelow"; break;
|
||||
case 206: s = "AttachedBelowRight"; break;
|
||||
case 208: s = "AttachedLeft"; break;
|
||||
case 210: s = "AttachedRight"; break;
|
||||
case 212: s = "AttachedAboveLeft"; break;
|
||||
case 214: s = "AttachedAbove"; break;
|
||||
case 216: s = "AttachedAboveRight"; break;
|
||||
case 218: s = "BelowLeft"; break;
|
||||
case 220: s = "Below"; break;
|
||||
case 222: s = "BelowRight"; break;
|
||||
case 224: s = "Left"; break;
|
||||
case 226: s = "Right"; break;
|
||||
case 228: s = "AboveLeft"; break;
|
||||
case 230: s = "Above"; break;
|
||||
case 232: s = "AboveRight"; break;
|
||||
case 233: s = "DoubleBelow"; break;
|
||||
case 234: s = "DoubleAbove"; break;
|
||||
case 240: s = "IotaSubscript"; break;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
int main = (propMask & 0xFF00);
|
||||
if (main == COMBINING_CLASS) {
|
||||
String s = getCombiningName(propMask);
|
||||
if (s.length() == 0) s = "Other Combining Class";
|
||||
return "# " + s;
|
||||
} else if (main == BINARY_PROPERTIES) {
|
||||
return "# Binary Property";
|
||||
} else if (main == JOINING_GROUP) {
|
||||
return "";
|
||||
} else {
|
||||
String shortID = getUnifiedBinaryPropertyID(ucdData, propMask, SHORT);
|
||||
String longID = getUnifiedBinaryPropertyID(ucdData, propMask, LONG);
|
||||
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
|
||||
}
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return getUnifiedBinaryPropertyID(propMask);
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (propMask < COMBINING_CLASS) return ""; // skip gen cat
|
||||
int cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
|
||||
return ucdData.getCategoryID(cp);
|
||||
}
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
return Utility.hex(ucdData.getDecompositionMapping(cp));
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public byte status(int cp) {
|
||||
//if (cp == 0xFFFF) {
|
||||
// System.out.println("# " + Utility.hex(cp));
|
||||
//}
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
//if (cp == 0x0385) {
|
||||
// System.out.println(Utility.hex(firstRealCp));
|
||||
//}
|
||||
|
||||
if (cat == Cn
|
||||
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
|
||||
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
|
||||
&& propMask != (CATEGORY | Cn)) {
|
||||
if (BRIDGE) return CONTINUE;
|
||||
else return EXCLUDE;
|
||||
}
|
||||
|
||||
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
|
||||
/*
|
||||
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
|
||||
if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
|
||||
else if (propMask == (SCRIPT | GREEK_SCRIPT)) inSet = cp > 0x1D6A3;
|
||||
}
|
||||
*/
|
||||
/* HACK
|
||||
1D400;MATHEMATICAL BOLD CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
|
||||
1D6A3;MATHEMATICAL MONOSPACE SMALL Z;Ll;0;L;<font> 007A;;;;N;;;;;
|
||||
1D6A8;MATHEMATICAL BOLD CAPITAL ALPHA;Lu;0;L;<font> 0391;;;;N;;;;;
|
||||
1D7C9;MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL;Ll;0;L;<font> 03D6;;;;N;;;;;
|
||||
*/
|
||||
|
||||
if (!inSet) return EXCLUDE;
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return unified property number
|
||||
*/
|
||||
public static boolean isUnifiedBinaryPropertyDefined(UCD ucd, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
|
||||
case COMBINING_CLASS>>8: return ucd.isCombiningClassUsed((byte)propMask);
|
||||
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
|
||||
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
|
||||
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
|
||||
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
|
||||
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
|
||||
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
|
||||
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
|
||||
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
|
||||
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
|
||||
case AGE>>8: return propMask < LIMIT_AGE;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
|
||||
return getUnifiedBinaryProperty(ucdData, cp, propMask);
|
||||
}
|
||||
|
||||
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
return ucd.getCategory(cp) == propMask;
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
return ucd.getCombiningClass(cp) == propMask;
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
return ucd.getBidiClass(cp) == propMask;
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
return ucd.getDecompositionType(cp) == propMask;
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
return ucd.getNumericType(cp) == propMask;
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
return ucd.getEastAsianWidth(cp) == propMask;
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
return ucd.getLineBreak(cp) == propMask;
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
return ucd.getJoiningType(cp) == propMask;
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroup(cp) == propMask;
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
return ucd.getBinaryProperty(cp, propMask);
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
return ucd.getScript(cp) == propMask;
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAge(cp) == propMask;
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
|
||||
|
||||
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
|
||||
return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
|
||||
}
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
|
||||
if (longOne.equals(shortOne)) return longOne;
|
||||
return shortOne + "(" + longOne + ")";
|
||||
}
|
||||
|
||||
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
String pre = "";
|
||||
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
|
||||
String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
if (style < LONG) pre = preShort;
|
||||
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
|
||||
else pre = preShort + "(" + preLong + ")";
|
||||
}
|
||||
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
|
||||
if (shortOne.length() == 0) shortOne = "xx";
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
if (longOne.length() == 0) longOne = "none";
|
||||
|
||||
String post;
|
||||
if (style < LONG) post = shortOne;
|
||||
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
|
||||
else post = shortOne + "(" + longOne + ")";
|
||||
|
||||
if (pre.length() == 0) {
|
||||
pre = post + "=";
|
||||
post = "T";
|
||||
}
|
||||
|
||||
return pre + post;
|
||||
}
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
int enum = unifiedPropMask >> 8;
|
||||
byte propMask = (byte)unifiedPropMask;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_GC[propMask];
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
String s = "";
|
||||
if (style == LONG) {
|
||||
s = getCombiningName(unifiedPropMask);
|
||||
if (s.length() != 0) return s;
|
||||
s = "fixed_";
|
||||
}
|
||||
return s + ucd.getCombiningClassID_fromIndex((short)(0xFF & propMask));
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_BC[propMask];
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_DT[propMask];
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_NT[propMask];
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_EA[propMask];
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_LB[propMask];
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_JOINING_TYPE[propMask];
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroupID_fromIndex(propMask);
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_BP[propMask];
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
|
||||
return UCD_Names.ABB_SCRIPT[propMask];
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAgeID_fromIndex(propMask);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
}
|
||||
|
475
tools/unicodetools/com/ibm/text/UCD/Normalizer.java
Normal file
475
tools/unicodetools/com/ibm/text/UCD/Normalizer.java
Normal file
|
@ -0,0 +1,475 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import com.ibm.text.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
/**
|
||||
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
|
||||
* See UTR#15 for details.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* The Unicode Consortium makes no expressed or implied warranty of any
|
||||
* kind, and assumes no liability for errors or omissions.
|
||||
* No liability is assumed for incidental and consequential damages
|
||||
* in connection with or arising out of the use of the information here.
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public final class Normalizer implements UCD_Types {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
public static boolean SHOW_PROGRESS = false;
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public Normalizer(byte form, String unicodeVersion) {
|
||||
this.composition = (form & COMPOSITION_MASK) != 0;
|
||||
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
|
||||
this.data = getData(unicodeVersion);
|
||||
}
|
||||
|
||||
/**
 * Creates a normalizer for the given form, passing the empty string as the
 * Unicode version.
 *
 * @param form one of NFD, NFKD, NFC, NFKC
 */
public Normalizer(byte form) {
    this(form, "");
}
|
||||
|
||||
/**
|
||||
* Masks for the form selector
|
||||
*/
|
||||
public static final byte
|
||||
COMPATIBILITY_MASK = 1,
|
||||
COMPOSITION_MASK = 2;
|
||||
|
||||
/**
|
||||
* Normalization Form Selector
|
||||
*/
|
||||
public static final byte
|
||||
NFD = 0 ,
|
||||
NFKD = COMPATIBILITY_MASK,
|
||||
NFC = COMPOSITION_MASK,
|
||||
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
* replacing contents of the target buffer.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
public StringBuffer normalize(String source, StringBuffer target) {
|
||||
|
||||
// First decompose the source into target,
|
||||
// then compose if the form requires.
|
||||
|
||||
if (source.length() != 0) {
|
||||
internalDecompose(source, target);
|
||||
if (composition) {
|
||||
internalCompose(target);
|
||||
}
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(String source) {
|
||||
return normalize(source, new StringBuffer()).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
* @return target the resulting normalized text
|
||||
*/
|
||||
public String normalize(int cp) {
|
||||
return normalize(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
/**
|
||||
*/
|
||||
private StringBuffer hasDecompositionBuffer = new StringBuffer();
|
||||
|
||||
public boolean hasDecomposition(int cp) {
|
||||
hasDecompositionBuffer.setLength(0);
|
||||
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
|
||||
if (hasDecompositionBuffer.length() != 1) return true;
|
||||
return cp != hasDecompositionBuffer.charAt(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does a quick check to see if the string is in the current form. Checks canonical order and
|
||||
* isAllowed().
|
||||
* @param source source text
|
||||
* @return YES, NO, MAYBE
|
||||
*/
|
||||
/*
|
||||
public static final int NO = 0, YES = 1, MAYBE = -1;
|
||||
|
||||
public int quickCheck(String source) {
|
||||
short lastCanonicalClass = 0;
|
||||
int result = YES;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
short canonicalClass = data.getCanonicalClass(ch);
|
||||
if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
|
||||
return NO;
|
||||
}
|
||||
int check = isAllowed(ch);
|
||||
if (check == NO) return NO;
|
||||
if (check == MAYBE) result = MAYBE;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find whether the given character is allowed in the current form.
|
||||
* @return YES, NO, MAYBE
|
||||
*/
|
||||
/*
|
||||
public int isAllowed(char ch) {
|
||||
if (composition) {
|
||||
if (compatibility) {
|
||||
if (data.isCompatibilityExcluded(ch)) {
|
||||
return NO;
|
||||
}
|
||||
} else {
|
||||
if (data.isExcluded(ch)) {
|
||||
return NO;
|
||||
}
|
||||
}
|
||||
if (data.isTrailing(ch)) {
|
||||
return MAYBE;
|
||||
}
|
||||
} else { // decomposition: both NFD and NFKD
|
||||
if (data.normalizationDiffers(compatibility,ch)) return NO;
|
||||
}
|
||||
return YES;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets the combining class of a character from the
|
||||
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
|
||||
* return an int to forstall problems.
|
||||
* @param ch the source character
|
||||
* @return value from 0 to 255
|
||||
*/
|
||||
|
||||
public short getCanonicalClass(char ch) {
|
||||
return data.getCanonicalClass(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Unicode Character Database. It is compatibility or canonical according to the particular
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
*/
|
||||
public boolean normalizationDiffers(int ch) {
|
||||
return data.normalizationDiffers(ch, composition, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets recursive decomposition of a character from the
|
||||
* Unicode Character Database.
|
||||
* @param compatibility If false selects the recursive
|
||||
* canonical decomposition, otherwise selects
|
||||
* the recursive compatibility AND canonical decomposition.
|
||||
* @param ch the source character
|
||||
* @param buffer buffer to be filled with the decomposition
|
||||
*/
|
||||
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
|
||||
data.getRecursiveDecomposition(ch, buffer, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility: Gets composition mapping.
|
||||
* @return IntEnumeration with the pair -> value mapping, where the
|
||||
* pair is firstChar << 16 | secondChar.
|
||||
* Will need to be fixed for surrogates.
|
||||
*/
|
||||
/*
|
||||
public IntHashtable.IntEnumeration getComposition() {
|
||||
return data.getComposition();
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
public boolean isTrailing(int cp) {
|
||||
return this.composition ? data.isTrailing(cp) : false;
|
||||
}
|
||||
|
||||
|
||||
// ======================================
|
||||
// PRIVATES
|
||||
// ======================================
|
||||
|
||||
/**
|
||||
* The current form.
|
||||
*/
|
||||
private boolean composition;
|
||||
private boolean compatibility;
|
||||
|
||||
/**
|
||||
* Decomposes text, either canonical or compatibility,
|
||||
* replacing contents of the target buffer.
|
||||
* @param form the normalization form. If COMPATIBILITY_MASK
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* compatibility decomposition, otherwise selects
|
||||
* the recursive canonical decomposition.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
private void internalDecompose(String source, StringBuffer target) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
int ch32;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
|
||||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compatibility);
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
// no decomposition mapping)
|
||||
|
||||
int ch;
|
||||
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16Plus.charAt(buffer, j);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int k = target.length(); // insertion point
|
||||
if (chClass != 0) {
|
||||
|
||||
// bubble-sort combining marks as necessary
|
||||
|
||||
int ch2;
|
||||
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
|
||||
ch2 = UTF16Plus.charAt(target, k-1);
|
||||
if (data.getCanonicalClass(ch2) <= chClass) break;
|
||||
}
|
||||
}
|
||||
target.insert(k, UTF16.valueOf(ch));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Composes text in place. Target must already
|
||||
* have been decomposed.
|
||||
* Uses UTF16, which is a utility class for supplementary character support in Java.
|
||||
* @param target input: decomposed text.
|
||||
* output: the resulting normalized text.
|
||||
*/
|
||||
private void internalCompose(StringBuffer target) {
|
||||
int starterPos = 0;
|
||||
int starterCh = UTF16Plus.charAt(target,0);
|
||||
int compPos = UTF16.getCharCount(starterCh); // length of last composition
|
||||
int lastClass = data.getCanonicalClass(starterCh);
|
||||
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
|
||||
int oldLen = target.length();
|
||||
|
||||
// Loop on the decomposed characters, combining where possible
|
||||
|
||||
int ch;
|
||||
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16Plus.charAt(target, decompPos);
|
||||
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
|
||||
+ ", decompPos: " + decompPos
|
||||
+ ", compPos: " + compPos
|
||||
+ ", ch: " + Utility.hex(ch)
|
||||
);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int composite = data.getPairwiseComposition(starterCh, ch);
|
||||
if (composite != data.NOT_COMPOSITE
|
||||
&& (lastClass < chClass || lastClass == 0)) {
|
||||
UTF16.setCharAt(target, starterPos, composite);
|
||||
// we know that we will only be replacing non-supplementaries by non-supplementaries
|
||||
// so we don't have to adjust the decompPos
|
||||
starterCh = composite;
|
||||
} else {
|
||||
if (chClass == 0) {
|
||||
starterPos = compPos;
|
||||
starterCh = ch;
|
||||
}
|
||||
lastClass = chClass;
|
||||
UTF16.setCharAt(target, compPos, ch);
|
||||
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
|
||||
System.out.println("ADJUSTING: " + Utility.hex(target));
|
||||
decompPos += target.length() - oldLen;
|
||||
oldLen = target.length();
|
||||
}
|
||||
compPos += UTF16.getCharCount(ch);
|
||||
}
|
||||
}
|
||||
target.setLength(compPos);
|
||||
}
|
||||
|
||||
static class Stub {
|
||||
private UCD ucd;
|
||||
private HashMap compTable = new HashMap();
|
||||
private BitSet isSecond = new BitSet();
|
||||
private BitSet canonicalRecompose = new BitSet();
|
||||
private BitSet compatibilityRecompose = new BitSet();
|
||||
static final int NOT_COMPOSITE = 0xFFFF;
|
||||
|
||||
Stub(String version) {
|
||||
ucd = UCD.make(version);
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isTrailingJamo(i)) isSecond.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
|
||||
try {
|
||||
String s = ucd.getDecompositionMapping(i);
|
||||
int len = UTF16.countCodePoint(s);
|
||||
if (len != 2) {
|
||||
if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
|
||||
continue;
|
||||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
if (ucd.getCombiningClass(a) != 0) continue;
|
||||
|
||||
int b = UTF16.charAt(s, UTF16.getCharCount(a));
|
||||
isSecond.set(b);
|
||||
|
||||
// have a recomposition, so set the bit
|
||||
canonicalRecompose.set(i);
|
||||
|
||||
// set the compatibility recomposition bit
|
||||
// ONLY if the component characters
|
||||
// don't compatibility decompose
|
||||
if (ucd.getDecompositionType(a) <= CANONICAL
|
||||
&& ucd.getDecompositionType(b) <= CANONICAL) {
|
||||
compatibilityRecompose.set(i);
|
||||
}
|
||||
|
||||
long key = (((long)a)<<32) | b;
|
||||
|
||||
/*if (i == '\u1E0A' || key == 0x004400000307) {
|
||||
System.out.println(Utility.hex(s));
|
||||
System.out.println(Utility.hex(i));
|
||||
System.out.println(Utility.hex(key));
|
||||
}*/
|
||||
compTable.put(new Long(key), new Integer(i));
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
// process compatibilityRecompose
|
||||
// have to do this afterwards, since we don't know whether the pieces
|
||||
// are allowable until we have processed all the characters
|
||||
/*
|
||||
Iterator it = compTable.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Long key = (Long)it.next();
|
||||
int cp = compTable.get(key);
|
||||
long keyLong = key.longValue();
|
||||
int first = (int)(keyLong >>> 32);
|
||||
int second = (int)keyLong;
|
||||
if (ucd.
|
||||
*/
|
||||
}
|
||||
/*
|
||||
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
|
||||
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
|
||||
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
|
||||
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
|
||||
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
|
||||
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
|
||||
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
|
||||
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
|
||||
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
||||
*/
|
||||
|
||||
short getCanonicalClass(int cp) {
|
||||
return ucd.getCombiningClass(cp);
|
||||
}
|
||||
|
||||
boolean isTrailing(int cp) {
|
||||
return isSecond.get(cp);
|
||||
}
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
if (compatibility) return dt >= CANONICAL;
|
||||
else return dt == CANONICAL;
|
||||
} else {
|
||||
// almost the same, except that we add back in the characters
|
||||
// that RECOMPOSE
|
||||
if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
|
||||
else return dt == CANONICAL && !canonicalRecompose.get(cp);
|
||||
}
|
||||
}
|
||||
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
|
||||
if (dt == CANONICAL || dt > CANONICAL && compatibility) {
|
||||
String s = ucd.getDecompositionMapping(cp);
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
getRecursiveDecomposition(cp, buffer, compatibility);
|
||||
}
|
||||
} else {
|
||||
UTF16.append(buffer, cp);
|
||||
}
|
||||
}
|
||||
|
||||
int getPairwiseComposition(int starterCh, int ch) {
|
||||
int hangulPoss = UCD.composeHangul(starterCh, ch);
|
||||
if (hangulPoss != 0xFFFF) return hangulPoss;
|
||||
Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch));
|
||||
if (obj == null) return 0xFFFF;
|
||||
return ((Integer)obj).intValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains normalization data from the Unicode Character Database.
|
||||
* use false for the minimal set, true for the real set.
|
||||
*/
|
||||
private Stub data;
|
||||
|
||||
private static HashMap versionCache = new HashMap();
|
||||
|
||||
private static Stub getData (String version) {
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
Stub result = (Stub)versionCache.get(version);
|
||||
if (result == null) {
|
||||
result = new Stub(version);
|
||||
versionCache.put(version, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
/*
|
||||
boolean isExcluded (char ch) {
|
||||
return data.isExcluded(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
/*
|
||||
String getRawDecompositionMapping (char ch) {
|
||||
return data.getRawDecompositionMapping(ch);
|
||||
}
|
||||
//*/
|
||||
}
|
203
tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
Normal file
203
tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
Normal file
|
@ -0,0 +1,203 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
abstract public class PropertyLister implements UCD_Types {
|
||||
|
||||
static final boolean COMPRESS_NAMES = false;
|
||||
static final boolean DROP_INDICATORS = true;
|
||||
|
||||
|
||||
protected UCD ucdData;
|
||||
protected PrintStream output;
|
||||
protected boolean showOnConsole;
|
||||
protected boolean usePropertyComment = true;
|
||||
protected int firstRealCp = -2;
|
||||
protected int lastRealCp = -2;
|
||||
protected boolean alwaysBreaks = false; // set to true if property only breaks
|
||||
|
||||
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
|
||||
|
||||
/**
|
||||
* @return status. Also have access to firstRealCp, lastRealCp
|
||||
*/
|
||||
abstract public byte status(int cp);
|
||||
|
||||
public String headerString() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (!usePropertyComment) return "";
|
||||
int cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
|
||||
return ucdData.getCategoryID(cp);
|
||||
}
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public void format(int startCp, int endCp, int realCount) {
|
||||
try {
|
||||
String prop = propertyName(startCp);
|
||||
if (prop.length() > 0) prop = "; " + prop;
|
||||
String opt = optionalName(startCp);
|
||||
if (opt.length() > 0) opt = "; " + opt;
|
||||
String optCom = optionalComment(startCp);
|
||||
if (optCom.length() > 0) optCom += " ";
|
||||
String startName = getKenName(startCp);
|
||||
String line;
|
||||
String pgap = Utility.repeat(" ", minPropertyWidth() - prop.length() - opt.length());
|
||||
if (startCp != endCp) {
|
||||
String endName = getKenName(endCp);
|
||||
int bridge = endCp - startCp + 1 - realCount;
|
||||
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
|
||||
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
|
||||
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
|
||||
|
||||
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
|
||||
+ prop + opt + pgap + " # " + optCom
|
||||
+ countStr;
|
||||
if (startName.length() != 0 || endName.length() != 0) {
|
||||
int com = 0;
|
||||
if (COMPRESS_NAMES) com = commonInitialWords(startName, endName);
|
||||
if (com == 0) {
|
||||
line += startName + ".." + endName;
|
||||
} else {
|
||||
line += startName.substring(0,com)
|
||||
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
String gap = alwaysBreaks
|
||||
? Utility.repeat(" ", 6 - width(startCp))
|
||||
: Utility.repeat(" ", 14 - width(startCp));
|
||||
String gap2 = alwaysBreaks
|
||||
? " "
|
||||
: " ";
|
||||
line = Utility.hex(startCp,4) + gap
|
||||
+ prop + opt + pgap + " # " + optCom + gap2
|
||||
+ startName;
|
||||
}
|
||||
output.println(line);
|
||||
if (showOnConsole) System.out.println(line);
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Format error {0}, {1}",
|
||||
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
|
||||
}
|
||||
}
|
||||
|
||||
int width(int cp) {
|
||||
return cp <= 0xFFFF ? 4
|
||||
: cp <= 0xFFFFF ? 5
|
||||
: 6;
|
||||
}
|
||||
|
||||
String getKenName(int cp) {
|
||||
String result = ucdData.getName(cp);
|
||||
if (result == null) return "";
|
||||
if (DROP_INDICATORS && result.charAt(0) == '<') {
|
||||
if (cp < 0xFF) return "<control>";
|
||||
return "";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return common initial substring length ending with SPACE or HYPHEN-MINUS. 0 if there is none
|
||||
*/
|
||||
public static int commonInitialWords(String a, String b) {
|
||||
if (a.length() > b.length()) {
|
||||
String temp = a;
|
||||
a = b;
|
||||
b = temp;
|
||||
}
|
||||
int lastSpace = 0;
|
||||
for (int i = 0; i < a.length(); ++i) {
|
||||
char ca = a.charAt(i);
|
||||
char cb = b.charAt(i);
|
||||
if (ca != cb) return lastSpace;
|
||||
if (ca == ' ' || ca == '-') lastSpace = i + 1;
|
||||
}
|
||||
if (b.length() == a.length() || b.charAt(a.length()) == ' ' || b.charAt(a.length()) == '-') {
|
||||
lastSpace = a.length();
|
||||
}
|
||||
return lastSpace;
|
||||
}
|
||||
|
||||
public int print() {
|
||||
int count = 0;
|
||||
firstRealCp = -1;
|
||||
byte firstRealCpCat = -1;
|
||||
lastRealCp = -1;
|
||||
int realRangeCount = 0;
|
||||
|
||||
String header = headerString();
|
||||
if (header.length() != 0) {
|
||||
output.println(header);
|
||||
output.println();
|
||||
}
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
byte s = status(cp);
|
||||
if (s == INCLUDE && firstRealCp != -1) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll) cat = Lu;
|
||||
if (cat != firstRealCpCat) s = BREAK;
|
||||
}
|
||||
|
||||
switch(s) {
|
||||
case CONTINUE:
|
||||
break; // do nothing
|
||||
case INCLUDE:
|
||||
if (firstRealCp == -1) {
|
||||
firstRealCp = cp;
|
||||
firstRealCpCat = ucdData.getCategory(firstRealCp);
|
||||
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
|
||||
}
|
||||
lastRealCp = cp;
|
||||
count++;
|
||||
realRangeCount++;
|
||||
break;
|
||||
case BREAK:
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
}
|
||||
lastRealCp = firstRealCp = cp;
|
||||
firstRealCpCat = ucdData.getCategory(firstRealCp);
|
||||
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
|
||||
|
||||
realRangeCount = 1;
|
||||
count++;
|
||||
break;
|
||||
case EXCLUDE:
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
firstRealCp = -1;
|
||||
realRangeCount = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
}
|
||||
|
||||
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
|
||||
output.println();
|
||||
output.println("# Total code points: " + count);
|
||||
output.println();
|
||||
return count;
|
||||
}
|
||||
}
|
473
tools/unicodetools/com/ibm/text/UCD/TestData.java
Normal file
473
tools/unicodetools/com/ibm/text/UCD/TestData.java
Normal file
|
@ -0,0 +1,473 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class TestData implements UCD_Types {
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
ucd = UCD.make();
|
||||
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
|
||||
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
int mask = 0;
|
||||
|
||||
if (false) {
|
||||
|
||||
generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedBidiClass-3.1.1d1.txt");
|
||||
|
||||
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedGeneralCategory-3.1.0d1.txt");
|
||||
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedCombiningClass-3.1.0d1.txt");
|
||||
generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedDecompositionType-3.1.0d1.txt");
|
||||
generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedNumericType-3.1.0d1.txt");
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-3.1.0d1.txt");
|
||||
generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedJoiningType-3.1.0d1.txt");
|
||||
generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedJoiningGroup-3.1.0d1.txt");
|
||||
generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedBinaryProperties-3.1.0d1.txt");
|
||||
generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedNumericValues-3.1.0d1.txt");
|
||||
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedLineBreak-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-3.1.0d4.txt");
|
||||
|
||||
generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
|
||||
KEEP_SPECIAL, HEADER_EXTEND, "PropList-3.1.0d5.txt");
|
||||
|
||||
|
||||
writeNormalizerTestSuite("NormalizationTest-3.1.0d1.txt");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
// HEADER_DERIVED, "DerivedPropData2-3.1.0d1.txt");
|
||||
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-3.1.0d1.txt");
|
||||
//listStrings("LowerCase-3.1.0d1.txt", 0,0);
|
||||
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-3.1.0d1.txt");
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
|
||||
//generateCompExclusions();
|
||||
System.out.println("END");
|
||||
}
|
||||
|
||||
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
public static void checkHoffman(String test) {
|
||||
String result = nfkc.normalize(test);
|
||||
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
|
||||
System.out.println();
|
||||
show(test, 0);
|
||||
System.out.println();
|
||||
show(result, 0);
|
||||
}
|
||||
|
||||
public static void show(String s, int indent) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
String cc = " " + ucd.getCombiningClass(cp);
|
||||
cc = Utility.repeat(" ", 4 - cc.length()) + cc;
|
||||
System.out.println(Utility.repeat(" ", indent) + ucd.getCode(cp) + cc + " " + ucd.getName(cp));
|
||||
String decomp = nfkc.normalize(cp);
|
||||
if (!decomp.equals(UTF32.valueOf32(cp))) {
|
||||
show(decomp, indent + 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
}
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
public static String fixFile(String s) {
|
||||
int len = s.length();
|
||||
if (!s.endsWith(".txt")) return s;
|
||||
if (s.charAt(len-6) != 'd') return s;
|
||||
char c = s.charAt(len-5);
|
||||
if (c < '0' || '9' < c) return s;
|
||||
System.out.println("Fixing File Name");
|
||||
return s.substring(0,len-6) + s.substring(len-4);
|
||||
}
|
||||
|
||||
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
|
||||
|
||||
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
|
||||
output.println("# " + fixFile(fileName));
|
||||
output.println("#");
|
||||
if (headerChoice == HEADER_SCRIPTS) {
|
||||
output.println("# For documentation, see UTR #24: Script Names");
|
||||
output.println("# http://www.unicode.org/unicode/reports/tr24/");
|
||||
} else if (headerChoice == HEADER_EXTEND) {
|
||||
output.println("# Unicode Character Database: Extended Properties");
|
||||
output.println("# For documentation, see PropList.html");
|
||||
} else {
|
||||
output.println("# Unicode Character Database: Derived Property Data");
|
||||
output.println("# Generated algorithmically from the Unicode Character Database");
|
||||
output.println("# For documentation, see DerivedProperties.html");
|
||||
}
|
||||
output.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
|
||||
output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
|
||||
output.println("# except when listing Noncharacter or Cn.");
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
|
||||
doHeader(fileName, output, headerChoice);
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
if ((bitMask & (1<<i)) == 0) continue;
|
||||
if (i >= DerivedPropertyLister.LIMIT) break;
|
||||
System.out.print('.');
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
new DerivedPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
/*
|
||||
public static void listStrings(String file, int type, int subtype) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
UCD ucd30 = UCD.make("300");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) System.out.println("# " + i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
if (ucd30.isRepresented(i)) continue;
|
||||
String string = "";
|
||||
switch(type) {
|
||||
case 0: string = ucd.getSimpleLowercase(i);
|
||||
}
|
||||
if (UTF32.length32(string) == 1 && UTF32.char32At(string,0) == i) continue;
|
||||
output.println(Utility.hex(i) + "; C; " + Utility.hex(string) + "; # " + ucd.getName(i));
|
||||
}
|
||||
output.close();
|
||||
}
|
||||
*/
|
||||
|
||||
public static void generateCompExclusions() throws IOException {
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
|
||||
new CompLister(output).print();
|
||||
output.close();
|
||||
}
|
||||
|
||||
static class CompLister extends PropertyLister {
|
||||
UCD oldUCD;
|
||||
int oldLength = 0;
|
||||
|
||||
public CompLister(PrintStream output) {
|
||||
this.output = output;
|
||||
ucdData = UCD.make("310");
|
||||
oldUCD = UCD.make("300");
|
||||
showOnConsole = true;
|
||||
}
|
||||
public String propertyName(int cp) {
|
||||
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
|
||||
}
|
||||
public byte status(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
|
||||
int temp = oldLength;
|
||||
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
|
||||
if (temp != oldLength) return BREAK;
|
||||
return INCLUDE;
|
||||
}
|
||||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial, int headerChoice, String file) throws IOException {
|
||||
|
||||
//System.out.println(ucd.toString(0x1E0A));
|
||||
/*
|
||||
System.out.println(ucd.getData(0xFFFF));
|
||||
System.out.println(ucd.getData(0x100000));
|
||||
System.out.println(ucd.getData(0x100000-1));
|
||||
System.out.println(ucd.getData(0x100000-2));
|
||||
System.out.println(ucd.getData(0x100000-3));
|
||||
if (true) return;
|
||||
String test2 = ucd.getName(0x2A6D6);
|
||||
//*/
|
||||
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
doHeader(file, output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (CATEGORY | UNUSED_CATEGORY)
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (JOINING_TYPE | JT_U)
|
||||
|| i == (SCRIPT | UNUSED_SCRIPT)
|
||||
|| i == (JOINING_GROUP | NO_SHAPING)
|
||||
) continue; // skip zero case
|
||||
if (skipSpecial == SKIP_SPECIAL
|
||||
&& i >= (BINARY_PROPERTIES | CompositionExclusion)
|
||||
&& i < (AGE + NEXT_ENUM)) continue;
|
||||
if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
|
||||
output.println();
|
||||
output.println("# ================================================");
|
||||
output.println("# " + UCD_Names.UNIFIED_PROPERTIES[i>>8]);
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
System.out.println();
|
||||
System.out.println(UCD_Names.UNIFIED_PROPERTIES[i>>8]);
|
||||
last = i;
|
||||
} else {
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
System.out.print(".");
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
if (endEnum == LIMIT_ENUM) {
|
||||
output.println();
|
||||
output.println("# ================================================");
|
||||
output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
System.out.println();
|
||||
System.out.println("@NUMERIC VALUES");
|
||||
|
||||
Set floatSet = new TreeSet();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
float nv = ucd.getNumericValue(i);
|
||||
if (Float.isNaN(nv)) continue;
|
||||
floatSet.add(new Float(nv));
|
||||
}
|
||||
Iterator it = floatSet.iterator();
|
||||
while(it.hasNext()) {
|
||||
new MyFloatLister(ucd, ((Float)it.next()).floatValue(), output).print();
|
||||
output.println();
|
||||
System.out.print(".");
|
||||
}
|
||||
}
|
||||
output.close();
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
static public Normalizer formC, formD, formKC, formKD;
|
||||
|
||||
static public void writeNormalizerTestSuite(String fileName) throws IOException {
|
||||
|
||||
PrintWriter log = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + fileName),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
formC = new Normalizer(Normalizer.NFC);
|
||||
formD = new Normalizer(Normalizer.NFD);
|
||||
formKC = new Normalizer(Normalizer.NFKC);
|
||||
formKD = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
log.println("# " + fixFile(fileName));
|
||||
log.println("#");
|
||||
log.println("# Normalization Test Suite");
|
||||
log.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
|
||||
log.println("# Format:");
|
||||
log.println("#");
|
||||
log.println("# Columns (c1, c2,...) are separated by semicolons");
|
||||
log.println("# Comments are indicated with hash marks");
|
||||
log.println("#");
|
||||
log.println("# CONFORMANCE:");
|
||||
log.println("# 1. The following invariants must be true for all conformant implementations");
|
||||
log.println("#");
|
||||
log.println("# NFC");
|
||||
log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
|
||||
log.println("# c4 == NFC(c4) == NFC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFD");
|
||||
log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
|
||||
log.println("# c5 == NFD(c4) == NFD(c5");
|
||||
log.println("#");
|
||||
log.println("# NFKC");
|
||||
log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
|
||||
log.println("#");
|
||||
log.println("# NFKD");
|
||||
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
|
||||
log.println("#");
|
||||
log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
|
||||
log.println("# listed in Part 1, the following invariants must be true for all conformant");
|
||||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
|
||||
System.out.println("Writing Part 1");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part0 # Specific cases");
|
||||
log.println("#");
|
||||
|
||||
for (int j = 0; j < testSuiteCases.length; ++j) {
|
||||
writeLine(testSuiteCases[j], log, false);
|
||||
}
|
||||
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part1 # Character by character test");
|
||||
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
|
||||
log.println("#");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
String cc = UTF32.valueOf32(ch);
|
||||
writeLine(cc,log, true);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Finding Examples");
|
||||
|
||||
String[] example = new String[256];
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
int cc = ucd.getCombiningClass(ch);
|
||||
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing Part 3");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part2 # Canonical Order Test");
|
||||
log.println("#");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
short c = ucd.getCombiningClass(ch);
|
||||
if (c == 0) continue;
|
||||
|
||||
// add character with higher class, same class, lower class
|
||||
|
||||
String sample = "";
|
||||
for (int i = c+1; i < example.length; ++i) {
|
||||
if (example[i] == null) continue;
|
||||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
sample += example[c];
|
||||
for (int i = c-1; i > 0; --i) {
|
||||
if (example[i] == null) continue;
|
||||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
|
||||
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
|
||||
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
|
||||
}
|
||||
Utility.fixDot();
|
||||
log.println("#");
|
||||
log.println("# END OF FILE");
|
||||
log.close();
|
||||
}
|
||||
|
||||
static void writeLine(String cc, PrintWriter log, boolean check) {
|
||||
String c = formC.normalize(cc);
|
||||
String d = formD.normalize(cc);
|
||||
String kc = formKC.normalize(cc);
|
||||
String kd = formKD.normalize(cc);
|
||||
if (check & cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd)) return;
|
||||
log.println(
|
||||
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
|
||||
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
|
||||
+ "; # ("
|
||||
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
|
||||
+ ") " + ucd.getName(cc));
|
||||
}
|
||||
|
||||
static StringBuffer commaResult = new StringBuffer();
|
||||
|
||||
// not recursive!!!
|
||||
static final String comma(String s) {
|
||||
commaResult.setLength(0);
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(i)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (ucd.getCategory(cp) == Mn) commaResult.append('\u25CC');
|
||||
UTF32.append32(commaResult, cp);
|
||||
}
|
||||
return commaResult.toString();
|
||||
}
|
||||
|
||||
static final String[] testSuiteCases = {
|
||||
"\u1E0A",
|
||||
"\u1E0C",
|
||||
"\u1E0A\u0323",
|
||||
"\u1E0C\u0307",
|
||||
"D\u0307\u0323",
|
||||
"D\u0323\u0307",
|
||||
"\u1E0A\u031B",
|
||||
"\u1E0C\u031B",
|
||||
"\u1E0A\u031B\u0323",
|
||||
"\u1E0C\u031B\u0307",
|
||||
"D\u031B\u0307\u0323",
|
||||
"D\u031B\u0323\u0307",
|
||||
"\u00C8",
|
||||
"\u0112",
|
||||
"E\u0300",
|
||||
"E\u0304",
|
||||
"\u1E14",
|
||||
"\u0112\u0300",
|
||||
"\u1E14\u0304",
|
||||
"E\u0304\u0300",
|
||||
"E\u0300\u0304",
|
||||
};
|
||||
|
||||
}
|
185
tools/unicodetools/com/ibm/text/UCD/TestNormalization.java
Normal file
185
tools/unicodetools/com/ibm/text/UCD/TestNormalization.java
Normal file
|
@ -0,0 +1,185 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public final class TestNormalization {
|
||||
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
|
||||
static final boolean SKIP_FILE = true;
|
||||
|
||||
static PrintWriter out = null;
|
||||
static BufferedReader in = null;
|
||||
|
||||
static Normalizer nfc;
|
||||
static Normalizer nfd;
|
||||
static Normalizer nfkc;
|
||||
static Normalizer nfkd;
|
||||
static UCD ucd;
|
||||
|
||||
static BitSet charsListed = new BitSet(0x110000);
|
||||
static int errorCount = 0;
|
||||
static int lineErrorCount = 0;
|
||||
static String originalLine = "";
|
||||
static String lastLine = "";
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
System.out.println("Creating Normalizers");
|
||||
ucd = UCD.make("");
|
||||
|
||||
nfc = new Normalizer(Normalizer.NFC);
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkc = new Normalizer(Normalizer.NFKC);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
String x = UTF32.valueOf32(0x10000);
|
||||
check("NFC", nfc, x);
|
||||
check("NFD", nfd, x);
|
||||
check("NFKC", nfkc, x);
|
||||
check("NFKD", nfkd, x);
|
||||
|
||||
|
||||
out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream("NormalizationTestLog.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
|
||||
in = new BufferedReader (
|
||||
new FileReader (DIR + "NormalizationTest.txt"),
|
||||
32*1024);
|
||||
|
||||
try {
|
||||
String[] parts = new String[10];
|
||||
|
||||
System.out.println("Checking files");
|
||||
|
||||
int count = 0;
|
||||
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
|
||||
if (line == null) break;
|
||||
originalLine = line;
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) {
|
||||
line = line.substring(0,pos);
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int splitCount = Utility.split(line, ';', parts);
|
||||
// FIX check splitCount
|
||||
for (int i = 0; i < splitCount; ++i) {
|
||||
parts[i] = Utility.fromHex(parts[i]);
|
||||
}
|
||||
|
||||
if (UTF32.length32(parts[0]) == 1) {
|
||||
int code = UTF32.char32At(parts[0],0);
|
||||
charsListed.set(code);
|
||||
if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
|
||||
}
|
||||
|
||||
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
errorCount += check("NFCa", nfc, parts[1], parts[0]);
|
||||
errorCount += check("NFCb", nfc, parts[1], parts[1]);
|
||||
errorCount += check("NFCc", nfc, parts[1], parts[2]);
|
||||
|
||||
// c4 == NFC(c4) == NFC(c5)
|
||||
errorCount += check("NFCd", nfc, parts[3], parts[3]);
|
||||
errorCount += check("NFCe", nfc, parts[3], parts[4]);
|
||||
|
||||
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
errorCount += check("NFDa", nfd, parts[2], parts[0]);
|
||||
errorCount += check("NFDb", nfd, parts[2], parts[1]);
|
||||
errorCount += check("NFDc", nfd, parts[2], parts[2]);
|
||||
|
||||
// c5 == NFD(c4) == NFD(c5)
|
||||
errorCount += check("NFDd", nfd, parts[4], parts[3]);
|
||||
errorCount += check("NFDe", nfd, parts[4], parts[4]);
|
||||
|
||||
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
|
||||
|
||||
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
|
||||
}
|
||||
System.out.println("Total errors in file: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
errorCount = lineErrorCount = 0;
|
||||
|
||||
System.out.println("Checking Missing");
|
||||
checkMissing();
|
||||
System.out.println("Total errors in unlisted items: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
|
||||
} finally {
|
||||
if (in != null) in.close();
|
||||
if (out != null) out.close();
|
||||
}
|
||||
}
|
||||
|
||||
static String lastBase = "";
|
||||
|
||||
public static int check(String type, Normalizer n, String base, String other) {
|
||||
try {
|
||||
String trans = n.normalize(other);
|
||||
if (!trans.equals(base)) {
|
||||
String temp = "";
|
||||
if (!lastLine.equals(originalLine)) {
|
||||
temp = "// " + originalLine;
|
||||
lastLine = originalLine;
|
||||
}
|
||||
if (!base.equals(lastBase)) {
|
||||
lastBase = base;
|
||||
lineErrorCount++;
|
||||
}
|
||||
String otherList = "";
|
||||
if (!base.equals(other)) {
|
||||
otherList = "(" + ucd.getCodeAndName(other) + ")";
|
||||
}
|
||||
out.println("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ type
|
||||
+ otherList
|
||||
+ " == " + ucd.getCodeAndName(trans)
|
||||
+ temp
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static int check(String type, Normalizer n, String base) {
|
||||
return check(type, n, base, base);
|
||||
}
|
||||
|
||||
static void checkMissing() {
|
||||
for (int missing = 0; missing < 0x100000; ++missing) {
|
||||
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
|
||||
if (charsListed.get(missing)) continue;
|
||||
String x = UTF32.valueOf32(missing);
|
||||
errorCount += check("NFC", nfc, x);
|
||||
errorCount += check("NFD", nfd, x);
|
||||
errorCount += check("NFKC", nfkc, x);
|
||||
errorCount += check("NFKD", nfkd, x);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
226
tools/unicodetools/com/ibm/text/UCD/UCD-in-XML-Notes.htm
Normal file
226
tools/unicodetools/com/ibm/text/UCD/UCD-in-XML-Notes.htm
Normal file
|
@ -0,0 +1,226 @@
|
|||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
|
||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
||||
<title>Unicode Character Database</title>
|
||||
<style>
|
||||
<!--
|
||||
table { padding: 4 }
|
||||
td { padding: 4 }
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<span class="cb" id style="DISPLAY: block">
|
||||
<h1 align="center">Unicode Character Database (UCD) in XML Format</h1>
|
||||
<h1 align="center"><b><font color="#FF0000">WARNING: FORMAT IS DRAFT!</font></b></h1>
|
||||
<p align="center">MD 2000.10.16</p>
|
||||
<table border="1" width="40%" align="right" cellspacing="4" cellpadding="0">
|
||||
<tr>
|
||||
<td width="100%" bgcolor="#C0C0C0"><span class="cb" id
|
||||
style="DISPLAY: block">
|
||||
<h4 align="center">Using Internet Explorer</h4>
|
||||
<p>The UCD-Main.xml file can be read in Internet Explorer (5.0 and above).
|
||||
However:</p>
|
||||
<ul>
|
||||
<li>It may take a few minutes to load completely.</li>
|
||||
<li>The XML parser in IE does not appear to be conformant: it seems to
|
||||
break on</span> the following valid code points (and others):
|
||||
<ul>
|
||||
<li>&lt;IEbugs<br>
c1='&amp;#xFFF9;'<br>
c2='&amp;#xFFFA;'<br>
c3='&amp;#xFFFB;'<br>
c4='&amp;#xFFFC;'<br>
c5='&amp;#xFFFD;'<br>
c6='&amp;#xF0000;'<br>
c7='&amp;#xFFFFD;'<br>
c8='&amp;#x100000;'<br>
c9='&amp;#x10FFFD;'/&gt;</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<p><a href="UCD-Main.xml">UCD-Main.xml</a> provides an XML format for the main
|
||||
files in the Unicode Character Database. These include:</p>
|
||||
<ul>
|
||||
<li><code>UnicodeData.txt</code></li>
|
||||
<li><code>ArabicShaping.txt</code></li>
|
||||
<li><code>Jamo.txt</code></li>
|
||||
<li><code>SpecialCasing.txt</code></li>
|
||||
<li><code>CompositionExclusions.txt</code></li>
|
||||
<li><code>EastAsianWidth.txt</code></li>
|
||||
<li><code>LineBreak.txt</code></li>
|
||||
<li><code>BidiMirroring.txt</code></li>
|
||||
<li><code>CaseFolding.txt</code></li>
|
||||
<li><code>Blocks.txt</code></li>
|
||||
<li><code>PropList.alpha.txt</code></li>
|
||||
</ul>
|
||||
<p>Other files in the UCD have very different structure or purpose, and are best
|
||||
expressed with separate files. Some annotational data, such as that in
|
||||
NamesList.txt or the 10646 comment in UnicodeData, is also best served with
|
||||
separate files. The current UCD files not yet in XML format are:</p>
|
||||
<ul>
|
||||
<li><code>Unihan.txt</code></li>
|
||||
<li><code>NamesList.txt</code></li>
|
||||
<li><code>Index.txt</code></li>
|
||||
<li><code>NormalizationTest.txt</code></li>
|
||||
</ul>
|
||||
<h3>Format</h3>
|
||||
<p>The Unicode blocks are provided as a list of &lt;block .../&gt; elements,
|
||||
with attributes providing the start, end, and name.</p>
|
||||
<p>Each assigned code point is an &lt;e .../&gt; element, with attributes
|
||||
supplying specific properties. The meaning of the attributes is specified below.
|
||||
There is one exception: large ranges of code points for characters such as
|
||||
Hangul Syllables are abbreviated by indicating the start and end of the range.</p>
|
||||
<p>Because of the volume of data, the attribute names are abbreviated. A <a
|
||||
href="#AttributeAbbreviations">key</a> explains the abbreviations, and relates
|
||||
them to the fields and values of the original UCD semicolon-delimited files.
|
||||
With few exceptions, the values in the XML are directly copied from data in the
|
||||
original UCD semicolon-delimited files. Those exceptions are described <a
|
||||
href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html#DataModifications">below</a>.</p>
|
||||
<p>Numeric character references (NCRs) are used to encode the Unicode code
|
||||
points. Some Unicode code points cannot be transmitted in XML, even as NCRs (see
|
||||
<a href="http://www.w3.org/TR/REC-xml#charsets">http://www.w3.org/TR/REC-xml#charsets</a>),
|
||||
or would not be visibly distinct (TAB, CR, LF) in the data. Such code points are
|
||||
represented by '#xX;', where X is a hex number.</p>
|
||||
<h3><a name="AttributeAbbreviations">Attribute Abbreviations</a></h3>
|
||||
<p>To reduce the size of the document, the following attribute abbreviations are
|
||||
used. If an attribute is missing, that means it gets a default value. The
|
||||
defaults are listed in parentheses below. If there is no specific default, then
|
||||
a missing attribute should be read as N/A (not applicable). A default with '='
|
||||
means the default is the value of another field (recursively!). Thus if
|
||||
the titlecase attribute is missing, then the value is the same as the uppercase.
|
||||
If that in turn is missing, then the value is the same as the code point itself.</p>
|
||||
<p>For a description of the source files, see <a
|
||||
href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">UnicodeCharacterDatabase.html</a>.
|
||||
That file also has links to the descriptions of the fields within the files.
|
||||
Since the PropList values are so long, they will probably also be abbreviated in
|
||||
the future.</p>
|
||||
<table border="1" width="100%">
|
||||
<tr>
|
||||
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
|
||||
<h4>UnicodeData</h4>
|
||||
<p> c: code point<br>
|
||||
n: name<br>
|
||||
gc: general category (Lo)<br>
|
||||
cc: combining class (0)<br>
|
||||
bc: bidi category (L)<br>
|
||||
dm: decomposition mapping<br>
|
||||
dt: decomposition type (canonical)<br>
|
||||
nt: numeric type<br>
|
||||
nv: numeric value<br>
|
||||
bm: bidi mirrored (N)<br>
|
||||
uc: uppercase (=c)<br>
|
||||
lc: lowercase (=c)<br>
|
||||
tc: titlecase (=uc)</p>
|
||||
<h4>SpecialCasing:</h4>
|
||||
<p> sl: special lower (=lc)<br>
|
||||
su: special upper (=uc)<br>
|
||||
st: special title (=su)<br>
|
||||
sc: special case condition</p>
|
||||
<h4>CaseFolding:</h4>
|
||||
<p> fc: foldcase (=sl)</span></td>
|
||||
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
|
||||
<h4>CompositionExclusions:</h4>
|
||||
<p> ce: composition exclusion (N)</p>
|
||||
<h4>EastAsianWidth:</h4>
|
||||
<p> ea: east asian width (N)</p>
|
||||
<h4>Jamo:</h4>
|
||||
<p> jn: jamo name</p>
|
||||
<h4>LineBreak:</h4>
|
||||
<p> lb: line break class (AL)</p>
|
||||
<h4>ArabicShaping:</h4>
|
||||
<p> jt: joining type<br>
|
||||
jg: joining group</p>
|
||||
<h4>BidiMirroring:</h4>
|
||||
<p> bg: bidi mirroring glyph (=c)</p>
|
||||
<p><b>PropList:</b></p>
|
||||
<p> xs: space-delimited list of properties from the file</p>
|
||||
<p><b><i>WARNING: these values are likely to change!</i></b></span></td>
|
||||
</tr>
|
||||
</table>
|
||||
<br>
|
||||
<h3><a name="DataModifications">Data Modifications</a></h3>
|
||||
</span>
|
||||
<p>The XML format is generated from the original semicolon-delimited UCD files.
|
||||
In general, all fields and values are direct copies. However, there are some
|
||||
changes, detailed below.</p>
|
||||
<h4>1. Some redundant or annotational fields are omitted</h4>
|
||||
<table border="1" width="100%">
|
||||
<tr>
|
||||
<td width="50%" valign="top"><b>UnicodeData<br>
|
||||
</b>1.0 Name<br>
|
||||
10646 comment<br>
|
||||
<br>
|
||||
<b>CaseFolding<br>
|
||||
</b>Type (since it is computable from whether the fold equals the normal
|
||||
lowercase)
|
||||
<p><b>ArabicShaping<br>
|
||||
</b>Name<br>
|
||||
<br>
|
||||
<b>EastAsianWidth<br>
|
||||
</b>Name<br>
|
||||
<br>
|
||||
<b>LineBreak<br>
|
||||
</b>Name</p>
|
||||
</td>
|
||||
<td width="50%" valign="top"><b>PropList</b><font face="Times New Roman"
|
||||
color="#000000">
|
||||
<p>The fields are based on the proposed PropList.alpha, which changes the
|
||||
fields considerably.</p>
|
||||
</font>
|
||||
<p><span class="cb" id style="display: block"><b><i>WARNING: other values
|
||||
are also likely to change!</i></b></span></p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<h4>2. Some fields are broken into several fields; others may be combined into a
|
||||
single field</h4>
|
||||
<ul>
|
||||
<li><b>dt: </b>decomposition tag
|
||||
<ul>
|
||||
<li>the 'tag' field extracted from the decomposition mapping. If there is
|
||||
no tag, the value is "canonical". Only has meaning if there is
|
||||
a decomposition (<b>dm</b>).</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>nt: </b>numeric type
|
||||
<ul>
|
||||
<li>an enumeration [decimal, digit, numeric] for the type of number. It
|
||||
replaces having duplicate field values for numbers</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>rg: </b>range
|
||||
<ul>
|
||||
<li>used for ranges of values that share characteristics, instead of
|
||||
having to do a substring check.<br>
|
||||
"START" corresponds to "&lt;..., First&gt;"<br>
|
||||
"END" corresponds to "&lt;..., Last&gt;"</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>nc: </b>name computed
|
||||
<ul>
|
||||
<li>if "COMPUTED", indicates that the name must be computed:
|
||||
e.g. Hangul Syllables, Ideographs</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><b>na: </b>name annotation
|
||||
<ul>
|
||||
<li>used for code points that do not really have associated names, like
|
||||
control characters and private use characters. The data in that case is
|
||||
either extracted from the "&lt;...&gt;" style name in the old
|
||||
format, or gotten from the "1.0 Unicode name".</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
974
tools/unicodetools/com/ibm/text/UCD/UCD.java
Normal file
974
tools/unicodetools/com/ibm/text/UCD/UCD.java
Normal file
|
@ -0,0 +1,974 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.BitSet;
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public final class UCD implements UCD_Types {
|
||||
/**
|
||||
* Used for the default version.
|
||||
*/
|
||||
public static final String latestVersion = "3.1.1";
|
||||
|
||||
/**
|
||||
* Create singleton instance for default (latest) version
|
||||
*/
|
||||
public static UCD make() {
|
||||
return make("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Create singleton instance for the specific version
|
||||
*/
|
||||
public static UCD make(String version) {
|
||||
if (version == null || version.length() == 0) version = latestVersion;
|
||||
UCD result = (UCD)versionCache.get(version);
|
||||
if (result == null) {
|
||||
result = new UCD();
|
||||
result.fillFromFile(version);
|
||||
versionCache.put(version, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the version of the UCD
|
||||
*/
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the date that the data was parsed
|
||||
*/
|
||||
public long getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the code point allocated?
|
||||
*/
|
||||
public boolean isAllocated(int codePoint) {
|
||||
if (getCategory(codePoint) != Cn) return true;
|
||||
if ((codePoint & 0xFFFE) == 0xFFFE) {
|
||||
if (major < 2 && codePoint > 0xFFFF) return false;
|
||||
return true; // Noncharacter
|
||||
}
|
||||
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the code point assigned to a character (or surrogate)
|
||||
*/
|
||||
public boolean isAssigned(int codePoint) {
|
||||
return getCategory(codePoint) != Cn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the code point a PUA character (fast check)
|
||||
*/
|
||||
public boolean isPUA(int codePoint) {
|
||||
return (codePoint >= 0xE000 && codePoint < 0xF900
|
||||
|| codePoint >= 0xF0000 && codePoint < 0xFFFFE
|
||||
|| codePoint >= 0x100000 && codePoint < 0x10FFFE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Many ranges are elided in the UCD. All but the first are not actually
|
||||
* represented in the data internally. This detects such cases.
|
||||
*/
|
||||
public boolean isRepresented(int codePoint) {
|
||||
return getRaw(codePoint) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return XML version of the data associated with the code point.
|
||||
*/
|
||||
public String toString(int codePoint) {
|
||||
return get(codePoint, true).toString(FULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character name.
|
||||
*/
|
||||
public String getName(int codePoint) {
|
||||
return get(codePoint, true).name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getName(String s) {
|
||||
if (s.length() == 1) return get(s.charAt(0), true).name;
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getName(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the code in U+ notation
|
||||
*/
|
||||
public static String getCode(int codePoint) {
|
||||
return "U+" + Utility.hex(codePoint);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the code in U+ notation
|
||||
*/
|
||||
public static String getCode(String s) {
|
||||
if (s.length() == 1) return getCode(s.charAt(0)); // fast path
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getCode(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for a code point
|
||||
*/
|
||||
public String getCodeAndName(int codePoint) {
|
||||
return getCode(codePoint) + " " + getName(codePoint);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for the code points in a string,
|
||||
* separated by ", "
|
||||
*/
|
||||
public String getCodeAndName(String s) {
|
||||
if (s == null || s.length() == 0) return "NULL";
|
||||
if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getCodeAndName(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the general category
|
||||
*/
|
||||
public byte getCategory(int codePoint) {
|
||||
return get(codePoint, false).generalCategory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the main category, as a mask
|
||||
*/
|
||||
public static int mainCategoryMask(byte cat) {
|
||||
switch (cat) {
|
||||
case Lu: case Ll: case Lt: case Lm: case Lo: return LETTER_MASK;
|
||||
case Mn: case Me: case Mc: return MARK_MASK;
|
||||
case Nd: case Nl: case No: return NUMBER_MASK;
|
||||
case Zs: case Zl: case Zp: return SEPARATOR_MASK;
|
||||
case Cc: case Cf: case Cs: case Co: return CONTROL_MASK;
|
||||
case Pc: case Pd: case Ps: case Pe: case Po: case Pi: case Pf: return PUNCTUATION_MASK;
|
||||
case Sm: case Sc: case Sk: case So: return SYMBOL_MASK;
|
||||
case Cn: return UNASSIGNED_MASK;
|
||||
}
|
||||
throw new IllegalArgumentException ("Illegal General Category " + cat);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the combining class, a number between zero and 255. Returned
|
||||
* as a short to avoid the signed-byte problem in Java
|
||||
*/
|
||||
public short getCombiningClass(int codePoint) {
|
||||
return (short)(get(codePoint, false).combiningClass & 0xFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this combining class actually occur in this version of the data.
|
||||
*/
|
||||
public boolean isCombiningClassUsed(byte value) {
|
||||
return combiningClassSet.get(0xFF & value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the bidi class
|
||||
*/
|
||||
public byte getBidiClass(int codePoint) {
|
||||
return get(codePoint, false).bidiClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the RAW decomposition mapping. Must be used recursively for the full mapping!
|
||||
*/
|
||||
public String getDecompositionMapping(int codePoint) {
|
||||
return get(codePoint, true).decompositionMapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get BIDI mirroring character, if there is one.
|
||||
*/
|
||||
public String getBidiMirror(int codePoint) {
|
||||
return get(codePoint, true).bidiMirror;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the RAW decomposition type: the <...> field in the UCD data.
|
||||
*/
|
||||
public byte getDecompositionType(int codePoint) {
|
||||
return get(codePoint, false).decompositionType;
|
||||
}
|
||||
|
||||
public float getNumericValue(int codePoint) {
|
||||
return get(codePoint, false).numericValue;
|
||||
}
|
||||
|
||||
public byte getNumericType(int codePoint) {
|
||||
return get(codePoint, false).numericType;
|
||||
}
|
||||
|
||||
public String getCase(int codePoint, byte simpleVsFull, byte caseType) {
|
||||
return getCase(codePoint, simpleVsFull, caseType, "");
|
||||
}
|
||||
|
||||
public String getCase(String s, byte simpleVsFull, byte caseType) {
|
||||
return getCase(s, simpleVsFull, caseType, "");
|
||||
}
|
||||
|
||||
public String getCase(int codePoint, byte simpleVsFull, byte caseType, String condition) {
|
||||
UData udata = get(codePoint, true);
|
||||
if (caseType < LOWER || caseType > FOLD
|
||||
|| (simpleVsFull != SIMPLE && simpleVsFull != FULL)) {
|
||||
throw new IllegalArgumentException("simpleVsFull or caseType out of bounds");
|
||||
}
|
||||
if (caseType < FOLD) {
|
||||
if (simpleVsFull == FULL && udata.specialCasing.length() != 0) {
|
||||
if (condition.length() == 0
|
||||
|| udata.specialCasing.indexOf(condition) < 0) {
|
||||
simpleVsFull = SIMPLE;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// special case. For these characters alone, use "I" as option meaning collapse to "i"
|
||||
//if (codePoint == 0x0131 || codePoint == 0x0130) { // special case turkish i
|
||||
if (getBinaryProperty(codePoint, CaseFoldTurkishI)) {
|
||||
if (!udata.specialCasing.equals("I")) simpleVsFull = SIMPLE;
|
||||
else simpleVsFull = FULL;
|
||||
}
|
||||
}
|
||||
|
||||
switch (caseType + simpleVsFull) {
|
||||
case SIMPLE + UPPER: return udata.simpleUppercase;
|
||||
case SIMPLE + LOWER: return udata.simpleLowercase;
|
||||
case SIMPLE + TITLE: return udata.simpleTitlecase;
|
||||
case SIMPLE + FOLD: return udata.simpleCaseFolding;
|
||||
case FULL + UPPER: return udata.fullUppercase;
|
||||
case FULL + LOWER: return udata.fullLowercase;
|
||||
case FULL + TITLE: return udata.fullTitlecase;
|
||||
case FULL + FOLD: return udata.fullCaseFolding;
|
||||
}
|
||||
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
|
||||
}
|
||||
|
||||
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
|
||||
if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
byte currentCaseType = caseType;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition);
|
||||
result.append(mappedVersion);
|
||||
if (caseType == TITLE) {
|
||||
// if letter is cased, change to lowercase, otherwise change to TITLE
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Mn || cat == Me || cat == Mc) {
|
||||
// ignore!
|
||||
} else if (cat == Lu || cat == Ll || cat == Lt
|
||||
|| getBinaryProperty(cp, Other_Lowercase)
|
||||
|| getBinaryProperty(cp, Other_Uppercase)) {
|
||||
currentCaseType = LOWER;
|
||||
} else {
|
||||
currentCaseType = TITLE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/*
|
||||
public String getSimpleLowercase(int codePoint) {
|
||||
return get(codePoint, true).simpleLowercase;
|
||||
}
|
||||
|
||||
public String getSimpleUppercase(int codePoint) {
|
||||
return get(codePoint, true).simpleUppercase;
|
||||
}
|
||||
|
||||
public String getSimpleTitlecase(int codePoint) {
|
||||
return get(codePoint, true).simpleTitlecase;
|
||||
}
|
||||
|
||||
public String getSimpleCaseFolding(int codePoint) {
|
||||
return get(codePoint, true).simpleCaseFolding;
|
||||
}
|
||||
|
||||
public String getFullLowercase(int codePoint) {
|
||||
return get(codePoint, true).fullLowercase;
|
||||
}
|
||||
|
||||
public String getFullUppercase(int codePoint) {
|
||||
return get(codePoint, true).fullUppercase;
|
||||
}
|
||||
|
||||
public String getFullTitlecase(int codePoint) {
|
||||
return get(codePoint, true).fullTitlecase;
|
||||
}
|
||||
|
||||
public String getFullCaseFolding(int codePoint) {
|
||||
return get(codePoint, true).simpleCaseFolding;
|
||||
}
|
||||
|
||||
public String getLowercase(int codePoint, boolean full) {
|
||||
if (full) return getFullLowercase(codePoint);
|
||||
return getSimpleLowercase(codePoint);
|
||||
}
|
||||
|
||||
public String getUppercase(int codePoint, boolean full) {
|
||||
if (full) return getFullUppercase(codePoint);
|
||||
return getSimpleLowercase(codePoint);
|
||||
}
|
||||
|
||||
public String getTitlecase(int codePoint, boolean full) {
|
||||
if (full) return getFullTitlecase(codePoint);
|
||||
return getSimpleTitlecase(codePoint);
|
||||
}
|
||||
|
||||
public String getCaseFolding(int codePoint, boolean full) {
|
||||
if (full) return getFullCaseFolding(codePoint);
|
||||
return getSimpleCaseFolding(codePoint);
|
||||
}
|
||||
|
||||
public String getLowercase(String s, boolean full) {
|
||||
if (s.length() == 1) return getLowercase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getLowercase(cp, true));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public String getUppercase(String s, boolean full) {
|
||||
if (s.length() == 1) return getUppercase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getUppercase(cp, true));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public String getTitlecase(String s, boolean full) {
|
||||
if (s.length() == 1) return getTitlecase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getTitlecase(cp, true));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public String getCaseFolding(String s, boolean full) {
|
||||
if (s.length() == 1) return getCaseFolding(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getCaseFolding(cp, true));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
*/
|
||||
|
||||
public String getSpecialCase(int codePoint) {
|
||||
return get(codePoint, true).specialCasing;
|
||||
}
|
||||
|
||||
public byte getEastAsianWidth(int codePoint) {
|
||||
return get(codePoint, false).eastAsianWidth;
|
||||
}
|
||||
|
||||
public byte getLineBreak(int codePoint) {
|
||||
return get(codePoint, false).lineBreak;
|
||||
}
|
||||
|
||||
public byte getScript(int codePoint) {
|
||||
return get(codePoint, false).script;
|
||||
}
|
||||
|
||||
public byte getAge(int codePoint) {
|
||||
return get(codePoint, false).age;
|
||||
}
|
||||
|
||||
public byte getJoiningType(int codePoint) {
|
||||
return get(codePoint, false).joiningType;
|
||||
}
|
||||
|
||||
public byte getJoiningGroup(int codePoint) {
|
||||
return get(codePoint, false).joiningGroup;
|
||||
}
|
||||
|
||||
public int getBinaryProperties(int codePoint) {
|
||||
return get(codePoint, false).binaryProperties;
|
||||
}
|
||||
|
||||
public boolean getBinaryProperty(int codePoint, int bit) {
|
||||
return (get(codePoint, false).binaryProperties & (1<<bit)) != 0;
|
||||
}
|
||||
|
||||
// ENUM Mask Utilities
|
||||
|
||||
public int getCategoryMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).generalCategory;
|
||||
}
|
||||
|
||||
public int getBidiClassMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).bidiClass;
|
||||
}
|
||||
|
||||
public int getNumericTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).numericType;
|
||||
}
|
||||
|
||||
public int getDecompositionTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).decompositionType;
|
||||
}
|
||||
|
||||
public int getEastAsianWidthMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).eastAsianWidth;
|
||||
}
|
||||
|
||||
public int getLineBreakMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).lineBreak;
|
||||
}
|
||||
|
||||
public int getScriptMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).script;
|
||||
}
|
||||
|
||||
public int getAgeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).age;
|
||||
}
|
||||
|
||||
public int getJoiningTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).joiningType;
|
||||
}
|
||||
|
||||
public int getJoiningGroupMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).joiningGroup;
|
||||
}
|
||||
|
||||
|
||||
// VERSIONS WITH NAMES
|
||||
|
||||
public String getCategoryID(int codePoint) {
|
||||
return getCategoryID_fromIndex(getCategory(codePoint));
|
||||
}
|
||||
|
||||
public static String getCategoryID_fromIndex(byte prop) {
|
||||
return UCD_Names.GC[prop];
|
||||
}
|
||||
|
||||
public String getBidiClassID(int codePoint) {
|
||||
return getBidiClassID_fromIndex(getBidiClass(codePoint));
|
||||
}
|
||||
|
||||
public static String getBidiClassID_fromIndex(byte prop) {
|
||||
return UCD_Names.BC[prop];
|
||||
}
|
||||
|
||||
public String getCombiningClassID(int codePoint) {
|
||||
return getCombiningClassID_fromIndex(getCombiningClass(codePoint));
|
||||
}
|
||||
|
||||
public static String getCombiningClassID_fromIndex(short cc) {
|
||||
return cc + "";
|
||||
}
|
||||
|
||||
public String getDecompositionTypeID(int codePoint) {
|
||||
return getDecompositionTypeID_fromIndex(getDecompositionType(codePoint));
|
||||
}
|
||||
|
||||
public static String getDecompositionTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.DT[prop];
|
||||
}
|
||||
|
||||
public String getNumericTypeID(int codePoint) {
|
||||
return getNumericTypeID_fromIndex(getNumericType(codePoint));
|
||||
}
|
||||
|
||||
public static String getNumericTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.NT[prop];
|
||||
}
|
||||
|
||||
public String getEastAsianWidthID(int codePoint) {
|
||||
return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint));
|
||||
}
|
||||
|
||||
public static String getEastAsianWidthID_fromIndex(byte prop) {
|
||||
return UCD_Names.EA[prop];
|
||||
}
|
||||
|
||||
public String getLineBreakID(int codePoint) {
|
||||
return getLineBreakID_fromIndex(getLineBreak(codePoint));
|
||||
}
|
||||
|
||||
public static String getLineBreakID_fromIndex(byte prop) {
|
||||
return UCD_Names.LB[prop];
|
||||
}
|
||||
|
||||
public String getJoiningTypeID(int codePoint) {
|
||||
return getJoiningTypeID_fromIndex(getJoiningType(codePoint));
|
||||
}
|
||||
|
||||
public static String getJoiningTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.JOINING_TYPE[prop];
|
||||
}
|
||||
|
||||
public String getJoiningGroupID(int codePoint) {
|
||||
return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint));
|
||||
}
|
||||
|
||||
public static String getJoiningGroupID_fromIndex(byte prop) {
|
||||
return UCD_Names.JOINING_GROUP[prop];
|
||||
}
|
||||
|
||||
public String getScriptID(int codePoint) {
|
||||
return getScriptID_fromIndex(getScript(codePoint));
|
||||
}
|
||||
|
||||
public static String getScriptID_fromIndex(byte prop) {
|
||||
return UCD_Names.SCRIPT[prop];
|
||||
}
|
||||
|
||||
public String getAgeID(int codePoint) {
|
||||
return getAgeID_fromIndex(getAge(codePoint));
|
||||
}
|
||||
|
||||
public static String getAgeID_fromIndex(byte prop) {
|
||||
return UCD_Names.AGE[prop];
|
||||
}
|
||||
|
||||
public String getBinaryPropertiesID(int codePoint, byte bit) {
|
||||
return (getBinaryProperties(codePoint) & (1<<bit)) != 0 ? "Y" : "N";
|
||||
}
|
||||
|
||||
public static String getBinaryPropertiesID_fromIndex(byte bit) {
|
||||
return UCD_Names.BP[bit];
|
||||
}
|
||||
|
||||
public static int mapToRepresentative(int ch, boolean old) {
|
||||
if (ch <= 0xFFFD) {
|
||||
//if (ch <= 0x2800) return ch;
|
||||
//if (ch <= 0x28FF) return 0x2800; // braille
|
||||
if (ch <= 0x3400) return ch; // CJK Ideograph Extension A
|
||||
if (ch <= 0x4DB5) return 0x3400;
|
||||
if (ch <= 0x4E00) return ch; // CJK Ideograph
|
||||
if (ch <= 0x9FA5) return 0x4E00;
|
||||
if (ch <= 0xAC00) return ch; // Hangul Syllable
|
||||
if (ch <= 0xD7A3) return 0xAC00;
|
||||
if (ch <= 0xD800) return ch; // Non Private Use High Surrogate
|
||||
if (ch <= 0xDB7F) return 0xD800;
|
||||
if (ch <= 0xDB80) return ch; // Private Use High Surrogate
|
||||
if (ch <= 0xDBFF) return 0xDB80;
|
||||
if (ch <= 0xDC00) return ch; // Low Surrogate
|
||||
if (ch <= 0xDFFF) return 0xDC00;
|
||||
if (ch <= 0xE000) return ch; // Private Use
|
||||
if (ch <= 0xF8FF) return 0xE000;
|
||||
if (old) {
|
||||
if (ch <= 0xF900) return ch; // CJK Compatibility Ideograp
|
||||
if (ch <= 0xFA2D) return 0xF900;
|
||||
}
|
||||
if (ch < 0xFDD0) return ch; // Noncharacter
|
||||
if (ch <= 0xFDEF) return 0xFFFF;
|
||||
} else {
|
||||
if ((ch & 0xFFFE) == 0xFFFE) return 0xFFFF; // Noncharacter
|
||||
if (ch <= 0x20000) return ch; // Extension B
|
||||
if (ch <= 0x2A6D6) return 0x20000;
|
||||
//if (ch <= 0x2F800) return ch;
|
||||
//if (ch <= 0x2FA1D) return 0x2F800; // compat ideographs
|
||||
if (ch <= 0xF0000) return ch; // Plane 15 Private Use
|
||||
if (ch <= 0xFFFFD) return 0xF0000; // Plane 16 Private Use
|
||||
if (ch <= 0x100000) return ch; // Plane 15 Private Use
|
||||
if (ch <= 0x10FFFD) return 0x100000; // Plane 16 Private Use
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
public boolean isIdentifierStart(int cp, boolean extended) {
|
||||
if (extended) {
|
||||
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
|
||||
if (cp == 0x037A || cp >= 0xFC5E && cp <= 0xFC63 || cp == 0xFDFA || cp == 0xFDFB) return false;
|
||||
if (cp >= 0xFE70 && cp <= 0xFE7E && (cp & 1) == 0) return false;
|
||||
}
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
|
||||
if (isIdentifierStart(cp, extended)) return true;
|
||||
if (extended) {
|
||||
if (cp == 0x00B7) return true;
|
||||
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return true;
|
||||
}
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isIdentifier(String s, boolean extended) {
|
||||
if (s.length() == 0) return false; // at least one!
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i == 0) {
|
||||
if (!isIdentifierStart(cp, extended)) return false;
|
||||
} else {
|
||||
if (!isIdentifierContinue_NO_Cf(cp, extended)) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/*
|
||||
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
|
||||
allowed in <identifier_extend>.
|
||||
|
||||
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
|
||||
0E33 THAI CHARACTER SARA AM
|
||||
0EB3 LAO VOWEL SIGN AM
|
||||
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
Irregularly decomposing characters. U+037A GREEK YPOGEGRAMMENI and certain Arabic presentation
|
||||
forms have irregular compatibility decompositions, and need to be excluded from both <identifier_start>
|
||||
and <identifier_extend>. It is recommended that all Arabic presentation forms be excluded from identifiers
|
||||
in any event, although only a few of them are required to be excluded for normalization
|
||||
to guarantee identifier closure.
|
||||
*/
|
||||
|
||||
// *******************
|
||||
// PRIVATES
|
||||
// *******************
|
||||
|
||||
// cache of singletons
|
||||
private static Map versionCache = new HashMap();
|
||||
|
||||
private static final int LIMIT_CODE_POINT = 0x110000;
|
||||
private static final UData[] ALL_NULLS = new UData[1024];
|
||||
|
||||
// main data
|
||||
private UData[][] data = new UData[LIMIT_CODE_POINT>>10][];
|
||||
|
||||
// extras
|
||||
private BitSet combiningClassSet = new BitSet(256);
|
||||
private String version;
|
||||
private String file;
|
||||
private long date = -1;
|
||||
private byte format = -1;
|
||||
private byte major = -1;
|
||||
private byte minor = -1;
|
||||
private byte update = -1;
|
||||
private int size = -1;
|
||||
|
||||
// cache last UData
|
||||
private int lastCode = Integer.MIN_VALUE;
|
||||
private UData lastResult = UData.UNASSIGNED;
|
||||
private boolean lastCodeFixed = false;
|
||||
|
||||
// hide constructor
|
||||
/**
 * Private constructor: points every slot of the top-level table at the
 * shared ALL_NULLS block so that lookups never meet a null block.
 */
private UCD() {
    int slot = data.length;
    while (slot-- > 0) {
        data[slot] = ALL_NULLS;
    }
}
|
||||
|
||||
private void add(UData uData) {
|
||||
int high = uData.codePoint>>10;
|
||||
if (data[high] == ALL_NULLS) {
|
||||
UData[] temp = new UData[1024];
|
||||
data[high] = temp;
|
||||
}
|
||||
data[high][uData.codePoint & 0x3FF] = uData;
|
||||
}
|
||||
|
||||
public boolean hasComputableName(int codePoint) {
|
||||
if (codePoint >= 0xF900 && codePoint <= 0xFA2D) return true;
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
switch (rangeStart) {
|
||||
default:
|
||||
return getRaw(codePoint) == null;
|
||||
case 0x2800: // braille
|
||||
case 0xF900: // compat ideos
|
||||
case 0x2F800: // compat ideos
|
||||
case 0x3400: // CJK Ideograph Extension A
|
||||
case 0x4E00: // CJK Ideograph
|
||||
case 0x20000: // Extension B
|
||||
case 0xAC00: // Hangul Syllable
|
||||
case 0xE000: // Private Use
|
||||
case 0xF0000: // Private Use
|
||||
case 0x100000: // Private Use
|
||||
case 0xD800: // Surrogate
|
||||
case 0xDB80: // Private Use
|
||||
case 0xDC00: // Private Use
|
||||
case 0xFFFF: // Noncharacter
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private UData getRaw(int codePoint) {
|
||||
return data[codePoint>>10][codePoint & 0x3FF];
|
||||
}
|
||||
|
||||
// access data for codepoint
|
||||
UData get(int codePoint, boolean fixStrings) {
|
||||
//if (codePoint == lastCode && fixStrings <= lastCodeFixed) return lastResult;
|
||||
/*
|
||||
// we play some funny tricks for performance
|
||||
// if cp is not represented, it is either in a elided block or missing.
|
||||
// elided blocks are either CONTINUE or FFFF
|
||||
|
||||
byte cat;
|
||||
if (!ucdData.isRepresented(cp)) {
|
||||
int rep = UCD.mapToRepresentative(cp);
|
||||
if (rep == 0xFFFF) cat = Cn;
|
||||
else if (rep != cp) return CONTINUE;
|
||||
else if (!ucdData.isRepresented(rep)) cat = Cn;
|
||||
else cat = ucdData.getCategory(rep);
|
||||
} else {
|
||||
cat = ucdData.getCategory(cp);
|
||||
}
|
||||
*/
|
||||
|
||||
UData result = null;
|
||||
|
||||
// do range stuff
|
||||
String constructedName = null;
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
boolean isHangul = false;
|
||||
switch (rangeStart) {
|
||||
case 0xF900:
|
||||
if (major < 2) {
|
||||
if (fixStrings) constructedName = "CJK COMPATIBILITY IDEOGRAPH-" + Utility.hex(codePoint, 4);
|
||||
break;
|
||||
}
|
||||
// FALL THROUGH!!!!
|
||||
default:
|
||||
result = getRaw(codePoint);
|
||||
if (result == null) {
|
||||
result = UData.UNASSIGNED;
|
||||
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
}
|
||||
return result;
|
||||
case 0x3400: // CJK Ideograph Extension A
|
||||
case 0x4E00: // CJK Ideograph
|
||||
case 0x20000: // Extension B
|
||||
if (fixStrings) constructedName = "CJK UNIFIED IDEOGRAPH-" + Utility.hex(codePoint, 4);
|
||||
break;
|
||||
case 0xAC00: // Hangul Syllable
|
||||
isHangul = true;
|
||||
if (fixStrings) {
|
||||
constructedName = "HANGUL SYLLABLE " + getHangulName(codePoint);
|
||||
}
|
||||
break;
|
||||
case 0xE000: // Private Use
|
||||
case 0xF0000: // Private Use
|
||||
case 0x100000: // Private Use
|
||||
if (fixStrings) constructedName = "<private use-" + Utility.hex(codePoint, 4) + ">";
|
||||
break;
|
||||
case 0xD800: // Surrogate
|
||||
case 0xDB80: // Private Use
|
||||
case 0xDC00: // Private Use
|
||||
if (fixStrings) constructedName = "<surrogate-" + Utility.hex(codePoint, 4) + ">";
|
||||
break;
|
||||
case 0xFFFF: // Noncharacter
|
||||
if (fixStrings) constructedName = "<noncharacter-" + Utility.hex(codePoint, 4) + ">";
|
||||
break;
|
||||
}
|
||||
result = getRaw(rangeStart);
|
||||
if (result == null) {
|
||||
result = UData.UNASSIGNED;
|
||||
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
return result;
|
||||
}
|
||||
|
||||
result.codePoint = codePoint;
|
||||
if (fixStrings) {
|
||||
result.name = constructedName;
|
||||
result.decompositionMapping = result.bidiMirror
|
||||
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
|
||||
= result.fullLowercase = result.fullUppercase = result.fullTitlecase = result.fullCaseFolding
|
||||
= UTF32.valueOf32(codePoint);
|
||||
}
|
||||
if (isHangul) {
|
||||
if (fixStrings) result.decompositionMapping = getHangulDecompositionPair(codePoint);
|
||||
result.decompositionType = CANONICAL;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Hangul constants
|
||||
|
||||
static final int
|
||||
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
|
||||
LCount = 19, VCount = 21, TCount = 28,
|
||||
NCount = VCount * TCount, // 588
|
||||
SCount = LCount * NCount, // 11172
|
||||
LLimit = LBase + LCount, // 1113
|
||||
VLimit = VBase + VCount, // 1176
|
||||
TLimit = TBase + TCount, // 11C3
|
||||
SLimit = SBase + SCount; // D7A4
|
||||
|
||||
private static String getHangulName(int s) {
|
||||
int SIndex = s - SBase;
|
||||
if (0 > SIndex || SIndex >= SCount) {
|
||||
throw new IllegalArgumentException("Not a Hangul Syllable: " + s);
|
||||
}
|
||||
int LIndex = SIndex / NCount;
|
||||
int VIndex = (SIndex % NCount) / TCount;
|
||||
int TIndex = SIndex % TCount;
|
||||
// if (true) return "?";
|
||||
return UCD_Names.JAMO_L_TABLE[LIndex] + UCD_Names.JAMO_V_TABLE[VIndex] + UCD_Names.JAMO_T_TABLE[TIndex];
|
||||
}
|
||||
|
||||
private static final char[] pair = new char[2];
|
||||
|
||||
static String getHangulDecompositionPair(int ch) {
|
||||
int SIndex = ch - SBase;
|
||||
if (0 > SIndex || SIndex >= SCount) {
|
||||
return "";
|
||||
}
|
||||
int TIndex = SIndex % TCount;
|
||||
if (TIndex != 0) { // triple
|
||||
pair[0] = (char)(SBase + SIndex - TIndex);
|
||||
pair[1] = (char)(TBase + TIndex);
|
||||
} else {
|
||||
pair[0] = (char)(LBase + SIndex / NCount);
|
||||
pair[1] = (char)(VBase + (SIndex % NCount) / TCount);
|
||||
}
|
||||
return String.valueOf(pair);
|
||||
}
|
||||
|
||||
static int composeHangul(int char1, int char2) {
|
||||
if (LBase <= char1 && char1 < LLimit && VBase <= char2 && char2 < VLimit) {
|
||||
return (SBase + ((char1 - LBase) * VCount + (char2 - VBase)) * TCount);
|
||||
}
|
||||
if (SBase <= char1 && char1 < SLimit && TBase <= char2 && char2 < TLimit
|
||||
&& ((char1 - SBase) % TCount) == 0) {
|
||||
return char1 + (char2 - TBase);
|
||||
}
|
||||
return 0xFFFF; // no composition
|
||||
}
|
||||
|
||||
static boolean isTrailingJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
DataInputStream dataIn = null;
|
||||
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
|
||||
int uDataFileCount = 0;
|
||||
try {
|
||||
dataIn = new DataInputStream(
|
||||
new BufferedInputStream(
|
||||
new FileInputStream(fileName),
|
||||
128*1024));
|
||||
// header
|
||||
format = dataIn.readByte();
|
||||
major = dataIn.readByte();
|
||||
minor = dataIn.readByte();
|
||||
update = dataIn.readByte();
|
||||
String foundVersion = major + "." + minor + "." + update;
|
||||
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
|
||||
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
|
||||
new Object[]{version, new Byte(format), foundVersion});
|
||||
}
|
||||
date = dataIn.readLong();
|
||||
size = uDataFileCount = dataIn.readInt();
|
||||
|
||||
boolean didJoiningHack = false;
|
||||
|
||||
|
||||
// records
|
||||
for (int i = 0; i < uDataFileCount; ++i) {
|
||||
UData uData = new UData();
|
||||
uData.readBytes(dataIn);
|
||||
|
||||
if (uData.codePoint == 0x2801) {
|
||||
System.out.println("SPOT-CHECK: " + uData);
|
||||
}
|
||||
|
||||
//T = Mc + (Cf - ZWNJ - ZWJ)
|
||||
int cp = uData.codePoint;
|
||||
byte old = uData.joiningType;
|
||||
byte cat = uData.generalCategory;
|
||||
//if (cp == 0x200D) {
|
||||
// uData.joiningType = JT_C;
|
||||
//} else
|
||||
if (cp != 0x200D && cp != 0x200C && (cat == Mn || cat == Cf)) {
|
||||
uData.joiningType = JT_T;
|
||||
}
|
||||
if (!didJoiningHack && uData.joiningType != old) {
|
||||
System.out.println("HACK: Setting "
|
||||
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
|
||||
+ ": " + Utility.hex(cp) + " " + uData.name);
|
||||
didJoiningHack = true;
|
||||
}
|
||||
|
||||
combiningClassSet.set(uData.combiningClass & 0xFF);
|
||||
add(uData);
|
||||
}
|
||||
/*
|
||||
if (update == -1) {
|
||||
throw new ChainException("Data File truncated for ",
|
||||
new Object[]{version}, e);
|
||||
}
|
||||
if (size != fileSize) {
|
||||
throw new ChainException("Counts do not match: file {0}, records {1}",
|
||||
new Object[]{new Integer(fileSize), new Integer(size)});
|
||||
}
|
||||
*/
|
||||
// everything is ok!
|
||||
this.version = version;
|
||||
this.file = fileName;
|
||||
//+ " " + new File(fileName).lastModified();
|
||||
} catch (IOException e) {
|
||||
throw new ChainException("Can't read data file for {0}", new Object[]{version}, e);
|
||||
} finally {
|
||||
if (dataIn != null) {
|
||||
try {
|
||||
dataIn.close();
|
||||
} catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
750
tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
Normal file
750
tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
Normal file
|
@ -0,0 +1,750 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
final class UCD_Names implements UCD_Types {
|
||||
|
||||
static final String[] UNIFIED_PROPERTIES = {
|
||||
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
|
||||
"Combining Class (listing UnicodeData.txt, field 3: see UnicodeData.html)",
|
||||
"Bidi Class (listing UnicodeData.txt, field 4: see UnicodeData.html)",
|
||||
"Decomposition Type (from UnicodeData.txt, field 5: see UnicodeData.html)",
|
||||
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
|
||||
"East Asian Width (listing EastAsianWidth.txt, field 1)",
|
||||
"Line Break (listing LineBreak.txt, field 1)",
|
||||
"Joining Type (listing ArabicShaping.txt, field 1).\r\n"
|
||||
+ "#\tType T is derived from Mn + Cf - ZWNJ - ZWJ\r\n"
|
||||
+ "#\tAll other code points have the type U",
|
||||
"Joining Group (listing ArabicShaping.txt, field 2)",
|
||||
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
|
||||
"Script",
|
||||
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
|
||||
};
|
||||
|
||||
static final String[] SHORT_UNIFIED_PROPERTIES = {
|
||||
"GeneralCategory",
|
||||
"CombiningClass",
|
||||
"BidiClass",
|
||||
"DecompositionType",
|
||||
"NumericType",
|
||||
"EastAsianWidth",
|
||||
"LineBreak",
|
||||
"JoiningType",
|
||||
"JoiningGroup",
|
||||
"Value",
|
||||
"Script",
|
||||
"Age"
|
||||
};
|
||||
|
||||
static final String[] ABB_UNIFIED_PROPERTIES = {
|
||||
"gc",
|
||||
"cc",
|
||||
"bc",
|
||||
"dt",
|
||||
"nt",
|
||||
"ea",
|
||||
"lb",
|
||||
"jt",
|
||||
"jg",
|
||||
"va",
|
||||
"sc",
|
||||
"Ag"
|
||||
};
|
||||
|
||||
|
||||
static final String[] BP = {
|
||||
"BidiMirrored",
|
||||
"CompositionExclusion",
|
||||
"White_Space",
|
||||
"NonBreak",
|
||||
"Bidi_Control",
|
||||
"Join_Control",
|
||||
"Dash",
|
||||
"Hyphen",
|
||||
"Quotation_Mark",
|
||||
"Terminal_Punctuation",
|
||||
"Other_Math",
|
||||
"Hex_Digit",
|
||||
"ASCII_Hex_Digit",
|
||||
"Other_Alphabetic",
|
||||
"Ideographic",
|
||||
"Diacritic",
|
||||
"Extender",
|
||||
"Other_Lowercase",
|
||||
"Other_Uppercase",
|
||||
"Noncharacter_Code_Point",
|
||||
"CaseFoldTurkishI",
|
||||
"Other_GraphemeExtend",
|
||||
"GraphemeLink",
|
||||
"IDS_BinaryOperator",
|
||||
"IDS_TrinaryOperator",
|
||||
"Radical",
|
||||
"UnifiedIdeograph",
|
||||
"Reserved_Cf_Code_Point",
|
||||
"Deprecated",
|
||||
};
|
||||
|
||||
static final String[] SHORT_BP = {
|
||||
"BidiM",
|
||||
"CExc",
|
||||
"WhSp",
|
||||
"NBrk",
|
||||
"BdCon",
|
||||
"JCon",
|
||||
"Dash",
|
||||
"Hyph",
|
||||
"QMark",
|
||||
"TPunc",
|
||||
"OMath",
|
||||
"HexD",
|
||||
"AHexD",
|
||||
"OAlph",
|
||||
"Ideo",
|
||||
"Diac",
|
||||
"Ext",
|
||||
"OLoc",
|
||||
"OUpc",
|
||||
"NChar",
|
||||
"TurkI",
|
||||
"OGrX",
|
||||
"GrLink",
|
||||
"IDSB",
|
||||
"IDST",
|
||||
"Radical",
|
||||
"UCJK",
|
||||
"RCf",
|
||||
"Dep",
|
||||
};
|
||||
|
||||
/*
|
||||
static final String[] BP_OLD = {
|
||||
"BidiMirrored",
|
||||
"CompositionExclusion",
|
||||
"White_space",
|
||||
"Non_break",
|
||||
"Bidi_Control",
|
||||
"Join_Control",
|
||||
"Dash",
|
||||
"Hyphen",
|
||||
"Quotation_Mark",
|
||||
"Terminal_Punctuation",
|
||||
"Math",
|
||||
"Hex_Digit",
|
||||
"Other_Alphabetic",
|
||||
"Ideographic",
|
||||
"Diacritic",
|
||||
"Extender",
|
||||
"Other_Lowercase",
|
||||
"Other_Uppercase",
|
||||
"Noncharacter_Code_Point",
|
||||
"Other_GraphemeExtend",
|
||||
"GraphemeLink",
|
||||
"IDS_BinaryOperator",
|
||||
"IDS_TrinaryOperator",
|
||||
"Radical",
|
||||
"UnifiedIdeograph"
|
||||
};
|
||||
*/
|
||||
|
||||
static final String[] DeletedProperties = {
|
||||
"Private_Use",
|
||||
"Composite",
|
||||
"Format_Control",
|
||||
"High_Surrogate",
|
||||
"Identifier_Part_Not_Cf",
|
||||
"Low_Surrogate",
|
||||
"Other_Format_Control",
|
||||
"Private_Use_High_Surrogate",
|
||||
"Unassigned_Code_Point"
|
||||
};
|
||||
|
||||
static final String[] YN_TABLE = {"N", "Y"};
|
||||
|
||||
static String[] EA = {
|
||||
"N", "A", "H", "W", "F", "Na"
|
||||
};
|
||||
|
||||
static String[] SHORT_EA = {
|
||||
"Neutral", "Ambiguous", "Halfwidth", "Wide", "Fullwidth", "Narrow"
|
||||
};
|
||||
|
||||
static final String[] LB = {
|
||||
"XX", "OP", "CL", "QU", "GL", "NS", "EX", "SY",
|
||||
"IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY",
|
||||
"CM", "BB", "BA", "SP", "BK", "CR", "LF", "CB",
|
||||
"SA", "AI", "B2", "SG", "ZW"
|
||||
};
|
||||
|
||||
static final String[] LONG_LB = {
|
||||
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
|
||||
"Glue", "Nonstarter", "Exclamation", "BreakSymbols",
|
||||
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
|
||||
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
|
||||
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
|
||||
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
|
||||
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
|
||||
};
|
||||
|
||||
public static final String[] SCRIPT = {
|
||||
"COMMON", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
|
||||
"LATIN", // LATIN
|
||||
"GREEK", // GREEK
|
||||
"CYRILLIC", // CYRILLIC
|
||||
"ARMENIAN", // ARMENIAN
|
||||
"HEBREW", // HEBREW
|
||||
"ARABIC", // ARABIC
|
||||
"SYRIAC", // SYRIAC
|
||||
"THAANA", // THAANA
|
||||
"DEVANAGARI", // DEVANAGARI
|
||||
"BENGALI", // BENGALI
|
||||
"GURMUKHI", // GURMUKHI
|
||||
"GUJARATI", // GUJARATI
|
||||
"ORIYA", // ORIYA
|
||||
"TAMIL", // TAMIL
|
||||
"TELUGU", // TELUGU
|
||||
"KANNADA", // KANNADA
|
||||
"MALAYALAM", // MALAYALAM
|
||||
"SINHALA", // SINHALA
|
||||
"THAI", // THAI
|
||||
"LAO", // LAO
|
||||
"TIBETAN", // TIBETAN
|
||||
"MYANMAR", // MYANMAR
|
||||
"GEORGIAN", // GEORGIAN
|
||||
"<unused>", // JAMO -- NOT SEPARATED FROM HANGUL IN 15924
|
||||
"HANGUL", // HANGUL
|
||||
"ETHIOPIC", // ETHIOPIC
|
||||
"CHEROKEE", // CHEROKEE
|
||||
"CANADIAN-ABORIGINAL", // ABORIGINAL
|
||||
"OGHAM", // OGHAM
|
||||
"RUNIC", // RUNIC
|
||||
"KHMER", // KHMER
|
||||
"MONGOLIAN", // MONGOLIAN
|
||||
"HIRAGANA", // HIRAGANA
|
||||
"KATAKANA", // KATAKANA
|
||||
"BOPOMOFO", // BOPOMOFO
|
||||
"HAN", // HAN
|
||||
"YI", // YI
|
||||
"OLD-ITALIC",
|
||||
"GOTHIC",
|
||||
"DESERET",
|
||||
"INHERITED",
|
||||
};
|
||||
|
||||
public static final String[] ABB_SCRIPT = {
|
||||
"Zyyy", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
|
||||
"Latn", // LATIN
|
||||
"Grek", // GREEK
|
||||
"Cyrl", // CYRILLIC
|
||||
"Armn", // ARMENIAN
|
||||
"Hebr", // HEBREW
|
||||
"Arab", // ARABIC
|
||||
"Syrc", // SYRIAC
|
||||
"Thaa", // THAANA
|
||||
"Deva", // DEVANAGARI
|
||||
"Beng", // BENGALI
|
||||
"Guru", // GURMUKHI
|
||||
"Gujr", // GUJARATI
|
||||
"Orya", // ORIYA
|
||||
"Taml", // TAMIL
|
||||
"Telu", // TELUGU
|
||||
"Knda", // KANNADA
|
||||
"Mlym", // MALAYALAM
|
||||
"Sinh", // SINHALA
|
||||
"Thai", // THAI
|
||||
"Laoo", // LAO
|
||||
"Tibt", // TIBETAN
|
||||
"Mymr", // MYANMAR
|
||||
"Geor", // GEORGIAN
|
||||
"<unused>", // JAMO -- NOT SEPARATED FROM HANGUL IN 15924
|
||||
"Hang", // HANGUL
|
||||
"Ethi", // ETHIOPIC
|
||||
"Cher", // CHEROKEE
|
||||
"Cans", // ABORIGINAL
|
||||
"Ogam", // OGHAM
|
||||
"Runr", // RUNIC
|
||||
"Khmr", // KHMER
|
||||
"Mong", // MONGOLIAN
|
||||
"Hira", // HIRAGANA
|
||||
"Kana", // KATAKANA
|
||||
"Bopo", // BOPOMOFO
|
||||
"Hani", // HAN
|
||||
"Yiii", // YI
|
||||
"Ital",
|
||||
"Goth",
|
||||
"Dsrt",
|
||||
"Qaai",
|
||||
};
|
||||
|
||||
|
||||
|
||||
static final String[] AGE = {
|
||||
"UNSPECIFIED",
|
||||
"1.1",
|
||||
"2.0", "2.1",
|
||||
"3.0", "3.1"
|
||||
};
|
||||
|
||||
|
||||
static final String[] GC = {
|
||||
"Cn", // = Other, Not Assigned 0
|
||||
|
||||
"Lu", // = Letter, Uppercase 1
|
||||
"Ll", // = Letter, Lowercase 2
|
||||
"Lt", // = Letter, Titlecase 3
|
||||
"Lm", // = Letter, Modifier 4
|
||||
"Lo", // = Letter, Other 5
|
||||
|
||||
"Mn", // = Mark, Non-Spacing 6
|
||||
"Me", // = Mark, Enclosing 7
|
||||
"Mc", // = Mark, Spacing Combining 8
|
||||
|
||||
"Nd", // = Number, Decimal Digit 9
|
||||
"Nl", // = Number, Letter 10
|
||||
"No", // = Number, Other 11
|
||||
|
||||
"Zs", // = Separator, Space 12
|
||||
"Zl", // = Separator, Line 13
|
||||
"Zp", // = Separator, Paragraph 14
|
||||
|
||||
"Cc", // = Other, Control 15
|
||||
"Cf", // = Other, Format 16
|
||||
"<unused>", // missing
|
||||
"Co", // = Other, Private Use 18
|
||||
"Cs", // = Other, Surrogate 19
|
||||
|
||||
|
||||
"Pd", // = Punctuation, Dash 20
|
||||
"Ps", // = Punctuation, Open 21
|
||||
"Pe", // = Punctuation, Close 22
|
||||
"Pc", // = Punctuation, Connector 23
|
||||
"Po", // = Punctuation, Other 24
|
||||
|
||||
"Sm", // = Symbol, Math 25
|
||||
"Sc", // = Symbol, Currency 26
|
||||
"Sk", // = Symbol, Modifier 27
|
||||
"So", // = Symbol, Other 28
|
||||
|
||||
"Pi", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
|
||||
"Pf" // = Punctuation, Final quote 30 (may behave like Ps or Pe depending on usage)
|
||||
};
|
||||
|
||||
static final String[] LONG_GC = {
|
||||
"Unassigned", // = Other, Not Assigned 0
|
||||
|
||||
"UppercaseLetter", // = Letter, Uppercase 1
|
||||
"LowercaseLetter", // = Letter, Lowercase 2
|
||||
"TitlecaseLetter", // = Letter, Titlecase 3
|
||||
"ModifierLetter", // = Letter, Modifier 4
|
||||
"OtherLetter", // = Letter, Other 5
|
||||
|
||||
"NonspacingMark", // = Mark, Non-Spacing 6
|
||||
"EnclosingMark", // = Mark, Enclosing 7
|
||||
"SpacingMark", // = Mark, Spacing Combining 8
|
||||
|
||||
"DecimalNumber", // = Number, Decimal Digit 9
|
||||
"LetterNumber", // = Number, Letter 10
|
||||
"OtherNumber", // = Number, Other 11
|
||||
|
||||
"SpaceSeparator", // = Separator, Space 12
|
||||
"LineSeparator", // = Separator, Line 13
|
||||
"ParagraphSeparator", // = Separator, Paragraph 14
|
||||
|
||||
"Control", // = Other, Control 15
|
||||
"Format", // = Other, Format 16
|
||||
"<unused>", // missing
|
||||
"PrivateUse", // = Other, Private Use 18
|
||||
"Surrogate", // = Other, Surrogate 19
|
||||
|
||||
|
||||
"DashPunctuation", // = Punctuation, Dash 20
|
||||
"OpenPunctuation", // = Punctuation, Open 21
|
||||
"ClosePunctuation", // = Punctuation, Close 22
|
||||
"ConnectorPunctuation", // = Punctuation, Connector 23
|
||||
"OtherPunctuation", // = Punctuation, Other 24
|
||||
|
||||
"MathSymbol", // = Symbol, Math 25
|
||||
"CurrencySymbol", // = Symbol, Currency 26
|
||||
"ModifierSymbol", // = Symbol, Modifier 27
|
||||
"OtherSymbol", // = Symbol, Other 28
|
||||
|
||||
"InitialPunctuation", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
|
||||
"FinalPunctuation" // = Punctuation, Final quote 30 (may behave like Ps or Pe depending on usage)
|
||||
};
|
||||
|
||||
|
||||
|
||||
static String[] BC = {
|
||||
"L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
|
||||
"R", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
|
||||
"EN", // European Number
|
||||
"ES", // European Number Separator
|
||||
"ET", // European Number Terminator
|
||||
"AN", // Arabic Number
|
||||
"CS", // Common Number Separator
|
||||
"B", // Paragraph Separator
|
||||
"S", // Segment Separator
|
||||
"WS", // Whitespace
|
||||
"ON", // Other Neutrals ; All other characters: punctuation, symbols
|
||||
"<unused>", "BN", "NSM", "AL", "LRO", "RLO", "LRE", "RLE", "PDF"
|
||||
};
|
||||
|
||||
static String[] LONG_BC = {
|
||||
"LeftToRight", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
|
||||
"RightToLeft", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
|
||||
"EuropeanNumber", // European Number
|
||||
"EuropeanSeparator", // European Number Separator
|
||||
"EuropeanTerminator", // European Number Terminator
|
||||
"ArabicNumber", // Arabic Number
|
||||
"CommonSeparator", // Common Number Separator
|
||||
"ParagraphSeparator", // Paragraph Separator
|
||||
"SegmentSeparator", // Segment Separator
|
||||
"WhiteSpace", // Whitespace
|
||||
"OtherNeutral", // Other Neutrals ; All other characters: punctuation, symbols
|
||||
"<unused>",
|
||||
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
|
||||
"LeftToRightOverride",
|
||||
"RightToLeftOverride", "LeftToRightEmbedding",
|
||||
"RightToLeftEmbedding", "PopDirectionalFormat"
|
||||
};
|
||||
|
||||
private static String[] CASE_TABLE = {
|
||||
"LOWER", "TITLE", "UPPER", "UNCASED"
|
||||
};
|
||||
|
||||
static String[] DT = {
|
||||
"", // NONE
|
||||
"canonical", // CANONICAL
|
||||
"compat", // Otherwise unspecified compatibility character.
|
||||
"font", // A font variant (e.g. a blackletter form).
|
||||
"noBreak", // A no-break version of a space or hyphen.
|
||||
"initial", // // An initial presentation form (Arabic).
|
||||
"medial", // // A medial presentation form (Arabic).
|
||||
"final", // // A final presentation form (Arabic).
|
||||
"isolated", // An isolated presentation form (Arabic).
|
||||
"circle", // An encircled form.
|
||||
"super", // A superscript form.
|
||||
"sub", // A subscript form.
|
||||
"vertical", // A vertical layout presentation form.
|
||||
"wide", // A wide (or zenkaku) compatibility character.
|
||||
"narrow", // A narrow (or hankaku) compatibility character.
|
||||
"small", // A small variant form (CNS compatibility).
|
||||
"square", // A CJK squared font variant.
|
||||
"fraction", // A vulgar fraction form.
|
||||
};
|
||||
|
||||
static String[] SHORT_DT = {
|
||||
"", // NONE
|
||||
"ca", // CANONICAL
|
||||
"co", // Otherwise unspecified compatibility character.
|
||||
"fo", // A font variant (e.g. a blackletter form).
|
||||
"nb", // A no-break version of a space or hyphen.
|
||||
"in", // // An initial presentation form (Arabic).
|
||||
"me", // // A medial presentation form (Arabic).
|
||||
"fi", // // A final presentation form (Arabic).
|
||||
"is", // An isolated presentation form (Arabic).
|
||||
"ci", // An encircled form.
|
||||
"sp", // A superscript form.
|
||||
"sb", // A subscript form.
|
||||
"ve", // A vertical layout presentation form.
|
||||
"wi", // A wide (or zenkaku) compatibility character.
|
||||
"na", // A narrow (or hankaku) compatibility character.
|
||||
"sm", // A small variant form (CNS compatibility).
|
||||
"sq", // A CJK squared font variant.
|
||||
"fr", // A vulgar fraction form.
|
||||
};
|
||||
|
||||
static private String[] MIRRORED_TABLE = {
|
||||
"N",
|
||||
"Y"
|
||||
};
|
||||
|
||||
static String[] NT = {
|
||||
"",
|
||||
"numeric",
|
||||
"digit",
|
||||
"decimal",
|
||||
};
|
||||
|
||||
static String[] SHORT_NT = {
|
||||
"",
|
||||
"nu",
|
||||
"di",
|
||||
"de",
|
||||
};
|
||||
|
||||
// Sanity check run at class-load time: each name table must have exactly one
// entry per value of the corresponding enumeration limit. A mismatch is
// reported on System.err but does not abort loading.
static {
    reportIfOutOfSync(LIMIT_CATEGORY, GC.length, "category");
    reportIfOutOfSync(LIMIT_BIDI_CLASS, BC.length, "bidi");
    reportIfOutOfSync(LIMIT_LINE_BREAK, LB.length, "linebreak");
    reportIfOutOfSync(LIMIT_DECOMPOSITION_TYPE, DT.length, "compat type");
    // This check used to appear twice, both times mislabelled "compat type".
    reportIfOutOfSync(MIRRORED_LIMIT, MIRRORED_TABLE.length, "mirrored");
    reportIfOutOfSync(CASE_LIMIT, CASE_TABLE.length, "case");
    // This check used to be mislabelled "case".
    reportIfOutOfSync(LIMIT_NUMERIC_TYPE, NT.length, "numeric type");
    reportIfOutOfSync(LIMIT_EAST_ASIAN_WIDTH, EA.length, "east Asian Width");
    reportIfOutOfSync(LIMIT_BINARY_PROPERTIES, BP.length, "binary properties");
    reportIfOutOfSync(LIMIT_SCRIPT, SCRIPT.length, "script");
    reportIfOutOfSync(LIMIT_AGE, AGE.length, "age");
}

/**
 * Prints a diagnostic to System.err when a name table's length differs
 * from the limit of the enumeration it describes.
 *
 * @param limit       number of values in the enumeration
 * @param tableLength length of the matching name table
 * @param what        label identifying the table in the message
 */
private static void reportIfOutOfSync(int limit, int tableLength, String what) {
    if (limit != tableLength) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: " + what);
    }
}
|
||||
|
||||
// Result of looking up the name "ON" in the BC table via Utility.lookup (presumably its index — confirm against Utility).
public static byte ON = Utility.lookup("ON", BC);
|
||||
|
||||
public static String[] JOINING_TYPE = {
|
||||
"C",
|
||||
"D",
|
||||
"R",
|
||||
"U",
|
||||
"L",
|
||||
"T"
|
||||
};
|
||||
|
||||
public static String[] LONG_JOINING_TYPE = {
|
||||
"JoinCausing",
|
||||
"DualJoining",
|
||||
"RightJoining",
|
||||
"NonJoining",
|
||||
"LeftJoining",
|
||||
"Transparent"
|
||||
};
|
||||
|
||||
public static String[] JOINING_GROUP = {
|
||||
"NO_JOINING_GROUP",
|
||||
"AIN",
|
||||
"ALAPH",
|
||||
"ALEF",
|
||||
"BEH",
|
||||
"BETH",
|
||||
"DAL",
|
||||
"DALATH_RISH",
|
||||
"E",
|
||||
"FEH",
|
||||
"FINAL_SEMKATH",
|
||||
"GAF",
|
||||
"GAMAL",
|
||||
"HAH",
|
||||
"HAMZA_ON_HEH_GOAL",
|
||||
"HE",
|
||||
"HEH",
|
||||
"HEH_GOAL",
|
||||
"HETH",
|
||||
"KAF",
|
||||
"KAPH",
|
||||
"KNOTTED_HEH",
|
||||
"LAM",
|
||||
"LAMADH",
|
||||
"MEEM",
|
||||
"MIM",
|
||||
"NOON",
|
||||
"NUN",
|
||||
"PE",
|
||||
"QAF",
|
||||
"QAPH",
|
||||
"REH",
|
||||
"REVERSED_PE",
|
||||
"SAD",
|
||||
"SADHE",
|
||||
"SEEN",
|
||||
"SEMKATH",
|
||||
"SHIN",
|
||||
"SWASH_KAF",
|
||||
"TAH",
|
||||
"TAW",
|
||||
"TEH_MARBUTA",
|
||||
"TETH",
|
||||
"WAW",
|
||||
"YEH",
|
||||
"YEH_BARREE",
|
||||
"YEH_WITH_TAIL",
|
||||
"YUDH",
|
||||
"YUDH_HE",
|
||||
"ZAIN",
|
||||
};
|
||||
|
||||
public static String[] OLD_JOINING_GROUP = {
|
||||
"<no shaping>",
|
||||
"AIN",
|
||||
"ALAPH",
|
||||
"ALEF",
|
||||
"BEH",
|
||||
"BETH",
|
||||
"DAL",
|
||||
"DALATH RISH",
|
||||
"E",
|
||||
"FEH",
|
||||
"FINAL SEMKATH",
|
||||
"GAF",
|
||||
"GAMAL",
|
||||
"HAH",
|
||||
"HAMZA ON HEH GOAL",
|
||||
"HE",
|
||||
"HEH",
|
||||
"HEH GOAL",
|
||||
"HETH",
|
||||
"KAF",
|
||||
"KAPH",
|
||||
"KNOTTED HEH",
|
||||
"LAM",
|
||||
"LAMADH",
|
||||
"MEEM",
|
||||
"MIM",
|
||||
"NOON",
|
||||
"NUN",
|
||||
"PE",
|
||||
"QAF",
|
||||
"QAPH",
|
||||
"REH",
|
||||
"REVERSED PE",
|
||||
"SAD",
|
||||
"SADHE",
|
||||
"SEEN",
|
||||
"SEMKATH",
|
||||
"SHIN",
|
||||
"SWASH KAF",
|
||||
"TAH",
|
||||
"TAW",
|
||||
"TEH MARBUTA",
|
||||
"TETH",
|
||||
"WAW",
|
||||
"YEH",
|
||||
"YEH BARREE",
|
||||
"YEH WITH TAIL",
|
||||
"YUDH",
|
||||
"YUDH HE",
|
||||
"ZAIN",
|
||||
};
|
||||
|
||||
|
||||
|
||||
static String[] JAMO_L_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"G", // U+1100; G; HANGUL CHOSEONG KIYEOK
|
||||
"GG", // U+1101; GG; HANGUL CHOSEONG SSANGKIYEOK
|
||||
"N", // U+1102; N; HANGUL CHOSEONG NIEUN
|
||||
"D", // U+1103; D; HANGUL CHOSEONG TIKEUT
|
||||
"DD", // U+1104; DD; HANGUL CHOSEONG SSANGTIKEUT
|
||||
"R", // U+1105; L; HANGUL CHOSEONG RIEUL
|
||||
"M", // U+1106; M; HANGUL CHOSEONG MIEUM
|
||||
"B", // U+1107; B; HANGUL CHOSEONG PIEUP
|
||||
"BB", // U+1108; BB; HANGUL CHOSEONG SSANGPIEUP
|
||||
"S", // U+1109; S; HANGUL CHOSEONG SIOS
|
||||
"SS", // U+110A; SS; HANGUL CHOSEONG SSANGSIOS
|
||||
"", // U+110B; ; HANGUL CHOSEONG IEUNG
|
||||
"J", // U+110C; J; HANGUL CHOSEONG CIEUC
|
||||
"JJ", // U+110D; JJ; HANGUL CHOSEONG SSANGCIEUC
|
||||
"C", // U+110E; C; HANGUL CHOSEONG CHIEUCH
|
||||
"K", // U+110F; K; HANGUL CHOSEONG KHIEUKH
|
||||
"T", // U+1110; T; HANGUL CHOSEONG THIEUTH
|
||||
"P", // U+1111; P; HANGUL CHOSEONG PHIEUPH
|
||||
"H" // U+1112; H; HANGUL CHOSEONG HIEUH
|
||||
};
|
||||
|
||||
static String[] JAMO_V_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"A", // U+1161; A; HANGUL JUNGSEONG A
|
||||
"AE", // U+1162; AE; HANGUL JUNGSEONG AE
|
||||
"YA", // U+1163; YA; HANGUL JUNGSEONG YA
|
||||
"YAE", // U+1164; YAE; HANGUL JUNGSEONG YAE
|
||||
"EO", // U+1165; EO; HANGUL JUNGSEONG EO
|
||||
"E", // U+1166; E; HANGUL JUNGSEONG E
|
||||
"YEO", // U+1167; YEO; HANGUL JUNGSEONG YEO
|
||||
"YE", // U+1168; YE; HANGUL JUNGSEONG YE
|
||||
"O", // U+1169; O; HANGUL JUNGSEONG O
|
||||
"WA", // U+116A; WA; HANGUL JUNGSEONG WA
|
||||
"WAE", // U+116B; WAE; HANGUL JUNGSEONG WAE
|
||||
"OE", // U+116C; OE; HANGUL JUNGSEONG OE
|
||||
"YO", // U+116D; YO; HANGUL JUNGSEONG YO
|
||||
"U", // U+116E; U; HANGUL JUNGSEONG U
|
||||
"WEO", // U+116F; WEO; HANGUL JUNGSEONG WEO
|
||||
"WE", // U+1170; WE; HANGUL JUNGSEONG WE
|
||||
"WI", // U+1171; WI; HANGUL JUNGSEONG WI
|
||||
"YU", // U+1172; YU; HANGUL JUNGSEONG YU
|
||||
"EU", // U+1173; EU; HANGUL JUNGSEONG EU
|
||||
"YI", // U+1174; YI; HANGUL JUNGSEONG YI
|
||||
"I", // U+1175; I; HANGUL JUNGSEONG I
|
||||
};
|
||||
|
||||
static String[] JAMO_T_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"", // filler, for LV syllable
|
||||
"G", // U+11A8; G; HANGUL JONGSEONG KIYEOK
|
||||
"GG", // U+11A9; GG; HANGUL JONGSEONG SSANGKIYEOK
|
||||
"GS", // U+11AA; GS; HANGUL JONGSEONG KIYEOK-SIOS
|
||||
"N", // U+11AB; N; HANGUL JONGSEONG NIEUN
|
||||
"NJ", // U+11AC; NJ; HANGUL JONGSEONG NIEUN-CIEUC
|
||||
"NH", // U+11AD; NH; HANGUL JONGSEONG NIEUN-HIEUH
|
||||
"D", // U+11AE; D; HANGUL JONGSEONG TIKEUT
|
||||
"L", // U+11AF; L; HANGUL JONGSEONG RIEUL
|
||||
"LG", // U+11B0; LG; HANGUL JONGSEONG RIEUL-KIYEOK
|
||||
"LM", // U+11B1; LM; HANGUL JONGSEONG RIEUL-MIEUM
|
||||
"LB", // U+11B2; LB; HANGUL JONGSEONG RIEUL-PIEUP
|
||||
"LS", // U+11B3; LS; HANGUL JONGSEONG RIEUL-SIOS
|
||||
"LT", // U+11B4; LT; HANGUL JONGSEONG RIEUL-THIEUTH
|
||||
"LP", // U+11B5; LP; HANGUL JONGSEONG RIEUL-PHIEUPH
|
||||
"LH", // U+11B6; LH; HANGUL JONGSEONG RIEUL-HIEUH
|
||||
"M", // U+11B7; M; HANGUL JONGSEONG MIEUM
|
||||
"B", // U+11B8; B; HANGUL JONGSEONG PIEUP
|
||||
"BS", // U+11B9; BS; HANGUL JONGSEONG PIEUP-SIOS
|
||||
"S", // U+11BA; S; HANGUL JONGSEONG SIOS
|
||||
"SS", // U+11BB; SS; HANGUL JONGSEONG SSANGSIOS
|
||||
"NG", // U+11BC; NG; HANGUL JONGSEONG IEUNG
|
||||
"J", // U+11BD; J; HANGUL JONGSEONG CIEUC
|
||||
"C", // U+11BE; C; HANGUL JONGSEONG CHIEUCH
|
||||
"K", // U+11BF; K; HANGUL JONGSEONG KHIEUKH
|
||||
"T", // U+11C0; T; HANGUL JONGSEONG THIEUTH
|
||||
"P", // U+11C1; P; HANGUL JONGSEONG PHIEUPH
|
||||
"H", // U+11C2; H; HANGUL JONGSEONG HIEUH
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
static {
|
||||
UNASSIGNED_INFO.code = '\uFFFF';
|
||||
UNASSIGNED_INFO.name = "<reserved>";
|
||||
UNASSIGNED_INFO.decomposition = "";
|
||||
UNASSIGNED_INFO.fullCanonicalDecomposition = "";
|
||||
UNASSIGNED_INFO.fullCompatibilityDecomposition = "";
|
||||
UNASSIGNED_INFO.name10 = "";
|
||||
UNASSIGNED_INFO.comment = "";
|
||||
|
||||
UNASSIGNED_INFO.numericType = NONE;
|
||||
UNASSIGNED_INFO.decompositionType = NONE;
|
||||
|
||||
UNASSIGNED_INFO.category = lookup("Cn",CATEGORY_TABLE, "PROXY");
|
||||
UNASSIGNED_INFO.canonical = 0;
|
||||
|
||||
UNASSIGNED_INFO.uppercase = "";
|
||||
UNASSIGNED_INFO.lowercase = "";
|
||||
UNASSIGNED_INFO.titlecase = "";
|
||||
|
||||
UNASSIGNED_INFO.bidi = ON;
|
||||
|
||||
UNASSIGNED_INFO.mirrored = NO;
|
||||
}
|
||||
*/
|
||||
}
|
374
tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
Normal file
374
tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
Normal file
|
@ -0,0 +1,374 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
/**
 * Constants shared by the Unicode Character Database tools: data directory
 * locations, the binary-format version, and the numeric codes used for each
 * enumerated character property. Classes implement this interface to pull
 * the constants into scope.
 */
public interface UCD_Types {
    /** Root data directory; ends with a path separator. */
    public static final String DATA_DIR = "C:\\DATA\\";
    // DATA_DIR already ends with a separator, so the sub-directory names are
    // appended without a leading one (avoids "C:\DATA\\BIN\").
    public static final String BIN_DIR = DATA_DIR + "BIN\\";
    public static final String GEN_DIR = DATA_DIR + "GEN\\";

    /** Bumped if the binary format of the UCD changes. */
    public static final byte BINARY_FORMAT = 5;

    /*
     * Fields of a UnicodeData record, by position:
     *  0  Code value in 4-digit hexadecimal format
     *  1  Unicode 2.1 character name
     *  2  General category
     *  3  Canonical combining class
     *  4  Bidirectional category
     *  5  Character decomposition
     *  6  Decimal digit value
     *  7  Digit value
     *  8  Numeric value
     *  9  Mirrored flag
     * 10  Unicode 1.0 name
     * 11  10646 comment field (informative)
     * 12  Upper case equivalent mapping
     * 13  Lower case equivalent mapping (informative)
     * 14  Title case equivalent mapping (informative)
     */

    // Binary ENUM grouping: each property group starts NEXT_ENUM (0x100) after the previous one.
    public static final int
        CATEGORY = 0,
        COMBINING_CLASS = 0x100,
        BIDI_CLASS = 0x200,
        DECOMPOSITION_TYPE = 0x300,
        NUMERIC_TYPE = 0x400,
        EAST_ASIAN_WIDTH = 0x500,
        LINE_BREAK = 0x600,
        JOINING_TYPE = 0x700,
        JOINING_GROUP = 0x800,
        BINARY_PROPERTIES = 0x900,
        SCRIPT = 0xA00,
        AGE = 0xB00,
        NEXT_ENUM = 0x100,
        LIMIT_ENUM = AGE + 0x100;

    public static final int LIMIT_COMBINING_CLASS = 256;

    // General category codes (getCategory)
    public static final byte
        UNASSIGNED = 0,
        UPPERCASE_LETTER = 1,
        LOWERCASE_LETTER = 2,
        TITLECASE_LETTER = 3,
        MODIFIER_LETTER = 4,
        OTHER_LETTER = 5,
        NON_SPACING_MARK = 6,
        ENCLOSING_MARK = 7,
        COMBINING_SPACING_MARK = 8,
        DECIMAL_DIGIT_NUMBER = 9,
        LETTER_NUMBER = 10,
        OTHER_NUMBER = 11,
        SPACE_SEPARATOR = 12,
        LINE_SEPARATOR = 13,
        PARAGRAPH_SEPARATOR = 14,
        CONTROL = 15,
        FORMAT = 16,
        UNUSED_CATEGORY = 17,
        PRIVATE_USE = 18,
        SURROGATE = 19,
        DASH_PUNCTUATION = 20,
        START_PUNCTUATION = 21,
        END_PUNCTUATION = 22,
        CONNECTOR_PUNCTUATION = 23,
        OTHER_PUNCTUATION = 24,
        MATH_SYMBOL = 25,
        CURRENCY_SYMBOL = 26,
        MODIFIER_SYMBOL = 27,
        OTHER_SYMBOL = 28,
        INITIAL_PUNCTUATION = 29,
        FINAL_PUNCTUATION = 30,
        LIMIT_CATEGORY = FINAL_PUNCTUATION + 1,

        // Unicode two-letter abbreviations for the same codes
        Lu = UPPERCASE_LETTER, Ll = LOWERCASE_LETTER, Lt = TITLECASE_LETTER,
        Lm = MODIFIER_LETTER, Lo = OTHER_LETTER,
        Mn = NON_SPACING_MARK, Me = ENCLOSING_MARK, Mc = COMBINING_SPACING_MARK,
        Nd = DECIMAL_DIGIT_NUMBER, Nl = LETTER_NUMBER, No = OTHER_NUMBER,
        Zs = SPACE_SEPARATOR, Zl = LINE_SEPARATOR, Zp = PARAGRAPH_SEPARATOR,
        Cc = CONTROL, Cf = FORMAT, Cs = SURROGATE, Co = PRIVATE_USE, Cn = UNASSIGNED,
        Pc = CONNECTOR_PUNCTUATION, Pd = DASH_PUNCTUATION, Ps = START_PUNCTUATION,
        Pe = END_PUNCTUATION, Po = OTHER_PUNCTUATION,
        Pi = INITIAL_PUNCTUATION, Pf = FINAL_PUNCTUATION,
        Sm = MATH_SYMBOL, Sc = CURRENCY_SYMBOL, Sk = MODIFIER_SYMBOL, So = OTHER_SYMBOL;

    // Bit masks over general categories: bit n is set for category code n.
    public static final int
        LETTER_MASK = (1 << Lu) | (1 << Ll) | (1 << Lt) | (1 << Lm) | (1 << Lo),
        MARK_MASK = (1 << Mn) | (1 << Me) | (1 << Mc),
        NUMBER_MASK = (1 << Nd) | (1 << Nl) | (1 << No),
        SEPARATOR_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp),
        CONTROL_MASK = (1 << Cc) | (1 << Cf) | (1 << Cs) | (1 << Co),
        PUNCTUATION_MASK = (1 << Pc) | (1 << Pd) | (1 << Ps) | (1 << Pe) | (1 << Po) | (1 << Pi) | (1 << Pf),
        SYMBOL_MASK = (1 << Sm) | (1 << Sc) | (1 << Sk) | (1 << So),
        UNASSIGNED_MASK = (1 << Cn);

    // Binary properties (bit positions)
    public static final byte
        BidiMirrored = 0,
        CompositionExclusion = 1,
        White_space = 2,
        Non_break = 3,
        Bidi_Control = 4,
        Join_Control = 5,
        Dash = 6,
        Hyphen = 7,
        Quotation_Mark = 8,
        Terminal_Punctuation = 9,
        Math_Property = 10,
        Hex_Digit = 11,
        ASCII_Hex_Digit = 12,
        Alphabetic = 13,
        Ideographic = 14,
        Diacritic = 15,
        Extender = 16,
        Other_Lowercase = 17,
        Other_Uppercase = 18,
        Noncharacter_Code_Point = 19,
        CaseFoldTurkishI = 20,
        Other_GraphemeExtend = 21,
        GraphemeLink = 22,
        IDS_BinaryOperator = 23,
        IDS_TrinaryOperator = 24,
        Radical = 25,
        UnifiedIdeograph = 26,
        Reserved_Cf_Code_Point = 27,
        Deprecated = 28,
        LIMIT_BINARY_PROPERTIES = 29;

    /*
    static final int
        BidiMirroredMask = 1<<BidiMirrored,
        CompositionExclusionMask = 1<<CompositionExclusion,
        AlphabeticMask = 1<<Alphabetic,
        Bidi_ControlMask = 1<<Bidi_Control,
        DashMask = 1<<Dash,
        DiacriticMask = 1<<Diacritic,
        ExtenderMask = 1<<Extender,
        Hex_DigitMask = 1<<Hex_Digit,
        HyphenMask = 1<<Hyphen,
        IdeographicMask = 1<<Ideographic,
        Join_ControlMask = 1<<Join_Control,
        Math_PropertyMask = 1<<Math_Property,
        Non_breakMask = 1<<Non_break,
        Noncharacter_Code_PointMask = 1<<Noncharacter_Code_Point,
        Other_LowercaseMask = 1<<Other_Lowercase,
        Other_UppercaseMask = 1<<Other_Uppercase,
        Quotation_MarkMask = 1<<Quotation_Mark,
        Terminal_PunctuationMask = 1<<Terminal_Punctuation,
        White_spaceMask = 1<<White_space;
    */

    // Line break
    public static final byte
        LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
        LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
        LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
        LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;

    // East Asian width
    public static final byte
        EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
        LIMIT_EAST_ASIAN_WIDTH = 6;

    // Bidi class
    public static final byte
        BIDI_L = 0,     // Left-Right; most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
        BIDI_R = 1,     // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
        BIDI_EN = 2,    // European Number
        BIDI_ES = 3,    // European Number Separator
        BIDI_ET = 4,    // European Number Terminator
        BIDI_AN = 5,    // Arabic Number
        BIDI_CS = 6,    // Common Number Separator
        BIDI_B = 7,     // Block Separator
        BIDI_S = 8,     // Segment Separator
        BIDI_WS = 9,    // Whitespace
        BIDI_ON = 10,   // Other Neutrals; all other characters: punctuation, symbols
        LIMIT_BIDI_2 = 11,
        BIDI_UNUSED = 11,
        BIDI_BN = 12,
        BIDI_NSM = 13,
        BIDI_AL = 14,
        BIDI_LRO = 15,
        BIDI_RLO = 16,
        BIDI_LRE = 17,
        BIDI_RLE = 18,
        BIDI_PDF = 19,
        LIMIT_BIDI_CLASS = 20;

    // Decomposition type
    public static final byte
        NONE = 0,
        CANONICAL = 1,
        COMPATIBILITY = 2,
        COMPAT_UNSPECIFIED = 2, // Otherwise unspecified compatibility character.
        COMPAT_FONT = 3,        // A font variant (e.g. a blackletter form).
        COMPAT_NOBREAK = 4,     // A no-break version of a space or hyphen.
        COMPAT_INITIAL = 5,     // An initial presentation form (Arabic).
        COMPAT_MEDIAL = 6,      // A medial presentation form (Arabic).
        COMPAT_FINAL = 7,       // A final presentation form (Arabic).
        COMPAT_ISOLATED = 8,    // An isolated presentation form (Arabic).
        COMPAT_CIRCLE = 9,      // An encircled form.
        COMPAT_SUPER = 10,      // A superscript form.
        COMPAT_SUB = 11,        // A subscript form.
        COMPAT_VERTICAL = 12,   // A vertical layout presentation form.
        COMPAT_WIDE = 13,       // A wide (or zenkaku) compatibility character.
        COMPAT_NARROW = 14,     // A narrow (or hankaku) compatibility character.
        COMPAT_SMALL = 15,      // A small variant form (CNS compatibility).
        COMPAT_SQUARE = 16,     // A CJK squared font variant.
        COMPAT_FRACTION = 17,   // A vulgar fraction form.
        LIMIT_DECOMPOSITION_TYPE = 18;

    // Mirrored type
    public static final byte NO = 0, YES = 1, MIRRORED_LIMIT = 2;

    // For QuickCheck
    public static final byte QNO = 0, QMAYBE = 1, QYES = 2;

    // Case type (UNCASED and FOLD share the value 3)
    public static final byte LOWER = 0, TITLE = 1, UPPER = 2, UNCASED = 3, FOLD = 3, CASE_LIMIT = 4;
    public static final byte SIMPLE = 0, FULL = 8;

    // Normalization type
    public static final byte UNNORMALIZED = 0, C = 1, KC = 2, D = 3, KD = 4, FORM_LIMIT = 5;

    // Numeric type
    public static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
        LIMIT_NUMERIC_TYPE = 4;

    // Script codes
    public static final byte
        COMMON_SCRIPT = 0,
        LATIN_SCRIPT = 1,
        GREEK_SCRIPT = 2,
        CYRILLIC_SCRIPT = 3,
        ARMENIAN_SCRIPT = 4,
        HEBREW_SCRIPT = 5,
        ARABIC_SCRIPT = 6,
        SYRIAC_SCRIPT = 7,
        THAANA_SCRIPT = 8,
        DEVANAGARI_SCRIPT = 9,
        BENGALI_SCRIPT = 10,
        GURMUKHI_SCRIPT = 11,
        GUJARATI_SCRIPT = 12,
        ORIYA_SCRIPT = 13,
        TAMIL_SCRIPT = 14,
        TELUGU_SCRIPT = 15,
        KANNADA_SCRIPT = 16,
        MALAYALAM_SCRIPT = 17,
        SINHALA_SCRIPT = 18,
        THAI_SCRIPT = 19,
        LAO_SCRIPT = 20,
        TIBETAN_SCRIPT = 21,
        MYANMAR_SCRIPT = 22,
        GEORGIAN_SCRIPT = 23,
        UNUSED_SCRIPT = 24,
        HANGUL_SCRIPT = 25,
        ETHIOPIC_SCRIPT = 26,
        CHEROKEE_SCRIPT = 27,
        ABORIGINAL_SCRIPT = 28,
        OGHAM_SCRIPT = 29,
        RUNIC_SCRIPT = 30,
        KHMER_SCRIPT = 31,
        MONGOLIAN_SCRIPT = 32,
        HIRAGANA_SCRIPT = 33,
        KATAKANA_SCRIPT = 34,
        BOPOMOFO_SCRIPT = 35,
        HAN_SCRIPT = 36,
        YI_SCRIPT = 37,
        OLD_ITALIC_SCRIPT = 38,
        GOTHIC_SCRIPT = 39,
        DESERET_SCRIPT = 40,
        INHERITED_SCRIPT = 41,
        LIMIT_SCRIPT = 42;

    // Age codes
    public static final int
        UNKNOWN = 0,
        AGE10 = 1,
        AGE20 = 2,
        AGE21 = 3,
        AGE30 = 4,
        AGE31 = 5,
        LIMIT_AGE = 6;

    // Joining type
    public static final byte
        JT_C = 0,
        JT_D = 1,
        JT_R = 2,
        JT_U = 3,
        JT_L = 4,
        JT_T = 5,
        LIMIT_JOINING_TYPE = 6;

    // Joining group
    public static final byte
        NO_SHAPING = 0,
        AIN = 1,
        ALAPH = 2,
        ALEF = 3,
        BEH = 4,
        BETH = 5,
        DAL = 6,
        DALATH_RISH = 7,
        E = 8,
        FEH = 9,
        FINAL_SEMKATH = 10,
        GAF = 11,
        GAMAL = 12,
        HAH = 13,
        HAMZA_ON_HEH_GOAL = 14,
        HE = 15,
        HEH = 16,
        HEH_GOAL = 17,
        HETH = 18,
        KAF = 19,
        KAPH = 20,
        KNOTTED_HEH = 21,
        LAM = 22,
        LAMADH = 23,
        MEEM = 24,
        MIM = 25,
        NOON = 26,
        NUN = 27,
        PE = 28,
        QAF = 29,
        QAPH = 30,
        REH = 31,
        REVERSED_PE = 32,
        SAD = 33,
        SADHE = 34,
        SEEN = 35,
        SEMKATH = 36,
        SHIN = 37,
        SWASH_KAF = 38,
        TAH = 39,
        TAW = 40,
        TEH_MARBUTA = 41,
        TETH = 42,
        WAW = 43,
        YEH = 44,
        YEH_BARREE = 45,
        YEH_WITH_TAIL = 46,
        YUDH = 47,
        YUDH_HE = 48,
        ZAIN = 49,
        LIMIT_JOINING_GROUP = 50;
}
|
317
tools/unicodetools/com/ibm/text/UCD/UData.java
Normal file
317
tools/unicodetools/com/ibm/text/UCD/UData.java
Normal file
|
@ -0,0 +1,317 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
class UData implements UCD_Types {
|
||||
// Name and string-valued mappings. fleshOut() fills in null mappings with
// defaults and compact() resets default-valued mappings to null.
String name;
String decompositionMapping;
String simpleUppercase, simpleLowercase, simpleTitlecase, simpleCaseFolding;
String fullUppercase, fullLowercase, fullTitlecase, fullCaseFolding;
String specialCasing = "";
String bidiMirror;

int codePoint = -1;
float numericValue = Float.NaN;     // NaN until a numeric value is assigned
int binaryProperties;               // bidiMirroring, compositionExclusions, PropList

// Enumerated properties, each initialised to a default code from UCD_Types.
byte generalCategory = Cn;
byte combiningClass = 0;
byte bidiClass = BIDI_ON;
byte decompositionType = NONE;
byte numericType = NUMERIC_NONE;

byte eastAsianWidth = EAN;
byte lineBreak = LBXX;
byte joiningType = JT_U;
byte joiningGroup = NO_SHAPING;
byte script = COMMON_SCRIPT;
byte age = 0;

// Shared record named "<unassigned>"; configured in the static initializer.
static final UData UNASSIGNED = new UData();
|
||||
//static final UData NONCHARACTER = new UData();
|
||||
// Configures the shared UNASSIGNED record: names it, gives it empty
// decomposition, mirror, and simple case mappings, then lets fleshOut()
// derive the remaining mappings.
static {
    UNASSIGNED.name = "<unassigned>";
    UNASSIGNED.simpleTitlecase = "";
    UNASSIGNED.simpleLowercase = "";
    UNASSIGNED.simpleUppercase = "";
    UNASSIGNED.bidiMirror = "";
    UNASSIGNED.decompositionMapping = "";
    UNASSIGNED.fleshOut();

    /*NONCHARACTER.name = "<noncharacter>";
    NONCHARACTER.decompositionMapping = NONCHARACTER.bidiMirror
    = NONCHARACTER.simpleUppercase
    = NONCHARACTER.simpleLowercase
    = NONCHARACTER.simpleTitlecase = "";

    NONCHARACTER.binaryProperties = Noncharacter_Code_PointMask;
    NONCHARACTER.fleshOut();
    */
}
|
||||
|
||||
/**
 * Creates a record for the given code point; every other field keeps its
 * declared default.
 *
 * @param codePoint the code point this record describes
 */
public UData(int codePoint) {
    this.codePoint = codePoint;
}
|
||||
|
||||
/** Creates a record with all fields at their declared defaults (codePoint is -1). */
public UData() {
    // nothing to initialise beyond the field defaults
}
|
||||
|
||||
public boolean equals(Object that) {
|
||||
UData other = (UData) that;
|
||||
if (!name.equals(other.name)) return false;
|
||||
if (!decompositionMapping.equals(other.decompositionMapping)) return false;
|
||||
if (!simpleUppercase.equals(other.simpleUppercase)) return false;
|
||||
if (!simpleLowercase.equals(other.simpleLowercase)) return false;
|
||||
if (!simpleTitlecase.equals(other.simpleTitlecase)) return false;
|
||||
if (!simpleCaseFolding.equals(other.simpleCaseFolding)) return false;
|
||||
if (!fullUppercase.equals(other.fullUppercase)) return false;
|
||||
if (!fullLowercase.equals(other.fullLowercase)) return false;
|
||||
if (!fullTitlecase.equals(other.fullTitlecase)) return false;
|
||||
if (!fullCaseFolding.equals(other.fullCaseFolding)) return false;
|
||||
if (!specialCasing.equals(other.specialCasing)) return false;
|
||||
if (!bidiMirror.equals(other.bidiMirror)) return false;
|
||||
if (codePoint != other.codePoint) return false;
|
||||
if (numericValue != other.numericValue) return false;
|
||||
if (binaryProperties != other.binaryProperties) return false;
|
||||
if (generalCategory != other.generalCategory) return false;
|
||||
if (combiningClass != other.combiningClass) return false;
|
||||
if (bidiClass != other.bidiClass) return false;
|
||||
if (decompositionType != other.decompositionType) return false;
|
||||
if (numericType != other.numericType) return false;
|
||||
if (eastAsianWidth != other.eastAsianWidth) return false;
|
||||
if (lineBreak != other.lineBreak) return false;
|
||||
if (joiningType != other.joiningType) return false;
|
||||
if (joiningGroup != other.joiningGroup) return false;
|
||||
if (script != other.script) return false;
|
||||
if (age != other.age) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public void fleshOut() {
|
||||
String codeValue = UTF32.valueOf32(codePoint);
|
||||
|
||||
if (decompositionMapping == null) decompositionMapping = codeValue;
|
||||
if (bidiMirror == null) bidiMirror = codeValue;
|
||||
|
||||
if (simpleLowercase == null) simpleLowercase = codeValue;
|
||||
if (simpleCaseFolding == null) simpleCaseFolding = simpleLowercase;
|
||||
if (fullLowercase == null) fullLowercase = simpleLowercase;
|
||||
if (fullCaseFolding == null) fullCaseFolding = fullLowercase;
|
||||
|
||||
if (simpleUppercase == null) simpleUppercase = codeValue;
|
||||
if (simpleTitlecase == null) simpleTitlecase = codeValue;
|
||||
if (fullUppercase == null) fullUppercase = simpleUppercase;
|
||||
|
||||
if (fullTitlecase == null) fullTitlecase = simpleTitlecase;
|
||||
}
|
||||
|
||||
public void compact() {
|
||||
fleshOut();
|
||||
String codeValue = UTF32.valueOf32(codePoint);
|
||||
|
||||
if (fullTitlecase.equals(simpleTitlecase)) fullTitlecase = null;
|
||||
|
||||
if (fullUppercase.equals(simpleUppercase)) fullUppercase = null;
|
||||
if (simpleTitlecase.equals(codeValue)) simpleTitlecase = null;
|
||||
if (simpleUppercase.equals(codeValue)) simpleUppercase = null;
|
||||
|
||||
if (fullCaseFolding.equals(fullLowercase)) fullCaseFolding = null;
|
||||
if (fullLowercase.equals(simpleLowercase)) fullLowercase = null;
|
||||
if (simpleCaseFolding.equals(simpleLowercase)) simpleCaseFolding = null;
|
||||
if (simpleLowercase.equals(codeValue)) simpleLowercase = null;
|
||||
|
||||
if (decompositionMapping.equals(codeValue)) decompositionMapping = null;
|
||||
if (bidiMirror.equals(codeValue)) bidiMirror = null;
|
||||
}
|
||||
|
||||
public void setBinaryProperties(int binaryProperties) {
|
||||
this.binaryProperties = binaryProperties;
|
||||
}
|
||||
|
||||
public boolean isLetter() {
|
||||
return ((1<<generalCategory) & UCD_Types.LETTER_MASK) != 0;
|
||||
}
|
||||
|
||||
public static void writeString(DataOutputStream os, String s) throws IOException {
|
||||
if (s == null) {
|
||||
os.writeByte(0);
|
||||
} else {
|
||||
os.writeByte(1);
|
||||
os.writeUTF(s);
|
||||
}
|
||||
}
|
||||
|
||||
// Shared 256-byte scratch buffer; not referenced by the methods visible here (NOTE(review): confirm it is still used elsewhere).
static final byte[] byteBuffer = new byte[256];
|
||||
|
||||
public static String readString(DataInputStream is) throws IOException {
|
||||
int type = is.readUnsignedByte();
|
||||
if (type == 0) return null;
|
||||
return is.readUTF();
|
||||
}
|
||||
|
||||
// Output styles for toString(byte). Note: this FULL (1) hides the FULL (8) constant inherited from UCD_Types inside this class.
static final byte ABBREVIATED = 0, FULL = 1;
|
||||
|
||||
public String toString() {
|
||||
return toString(FULL);
|
||||
}
|
||||
|
||||
public String toString(byte style) {
|
||||
boolean full = style == FULL;
|
||||
StringBuffer result = new StringBuffer();
|
||||
String s = UTF32.valueOf32(codePoint);
|
||||
|
||||
result.append("<e c='").append(Utility.quoteXML(codePoint)).append('\'');
|
||||
result.append(" hx='").append(Utility.hex(codePoint)).append('\'');
|
||||
if (full || script != COMMON_SCRIPT) result.append(" sn='").append(UCD_Names.SCRIPT[script]).append('\'');
|
||||
result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");
|
||||
|
||||
int lastPos = result.length();
|
||||
|
||||
if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
|
||||
if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
|
||||
if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
|
||||
if (full || !s.equals(decompositionMapping)) result.append(" dm='").append(Utility.quoteXML(decompositionMapping)).append('\'');
|
||||
|
||||
if (full || numericType != NUMERIC_NONE) result.append(" nt='").append(UCD_Names.NT[numericType]).append('\'');
|
||||
if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
|
||||
|
||||
if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
|
||||
if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
|
||||
if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
|
||||
if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
|
||||
if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
|
||||
|
||||
if (full || bidiClass != BIDI_L) result.append(" bc='").append(UCD_Names.BC[bidiClass]).append('\'');
|
||||
if (full || !bidiMirror.equals(s)) result.append(" bmg='").append(Utility.quoteXML(bidiMirror)).append('\'');
|
||||
|
||||
if (lastPos != result.length()) {
|
||||
result.append("\r\n");
|
||||
lastPos = result.length();
|
||||
}
|
||||
|
||||
//String bp = "";
|
||||
int bprops = binaryProperties;
|
||||
for (int i = 0; i < LIMIT_BINARY_PROPERTIES; ++i) {
|
||||
if ((bprops & (1<<i)) != 0) result.append(UCD_Names.BP[i]).append("='T' ");
|
||||
}
|
||||
if (lastPos != result.length()) {
|
||||
result.append("\r\n");
|
||||
lastPos = result.length();
|
||||
}
|
||||
|
||||
if (full || !fullLowercase.equals(s)) result.append(" lc='").append(Utility.quoteXML(fullLowercase)).append('\'');
|
||||
if (full || !fullUppercase.equals(simpleUppercase)) result.append(" uc='").append(Utility.quoteXML(fullUppercase)).append('\'');
|
||||
if (full || !fullTitlecase.equals(fullUppercase)) result.append(" tc='").append(Utility.quoteXML(fullTitlecase)).append('\'');
|
||||
if (full || !fullCaseFolding.equals(fullLowercase)) result.append(" cf='").append(Utility.quoteXML(fullCaseFolding)).append('\'');
|
||||
|
||||
if (full || !simpleLowercase.equals(simpleLowercase)) result.append(" slc='").append(Utility.quoteXML(simpleLowercase)).append('\'');
|
||||
if (full || !simpleUppercase.equals(simpleUppercase)) result.append(" suc='").append(Utility.quoteXML(simpleUppercase)).append('\'');
|
||||
if (full || !simpleTitlecase.equals(simpleUppercase)) result.append(" stc='").append(Utility.quoteXML(simpleTitlecase)).append('\'');
|
||||
if (full || !simpleCaseFolding.equals(simpleLowercase)) result.append(" sfc='").append(Utility.quoteXML(simpleCaseFolding)).append('\'');
|
||||
|
||||
if (full || !specialCasing.equals("")) result.append(" fsc='").append(Utility.quoteXML(specialCasing)).append('\'');
|
||||
result.append("/>");
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public void writeBytes(DataOutputStream os) throws IOException {
|
||||
compact();
|
||||
os.writeInt(codePoint);
|
||||
|
||||
writeString(os, name);
|
||||
writeString(os, decompositionMapping);
|
||||
writeString(os, simpleUppercase);
|
||||
writeString(os, simpleLowercase);
|
||||
writeString(os, simpleTitlecase);
|
||||
writeString(os, simpleCaseFolding);
|
||||
writeString(os, fullUppercase);
|
||||
writeString(os, fullLowercase);
|
||||
writeString(os, fullTitlecase);
|
||||
writeString(os, fullCaseFolding);
|
||||
writeString(os, specialCasing);
|
||||
writeString(os, bidiMirror);
|
||||
|
||||
os.writeFloat(numericValue);
|
||||
os.writeInt(binaryProperties);
|
||||
|
||||
os.writeByte(generalCategory);
|
||||
os.writeByte(combiningClass);
|
||||
os.writeByte(bidiClass);
|
||||
os.writeByte(decompositionType);
|
||||
os.writeByte(numericType);
|
||||
os.writeByte(eastAsianWidth);
|
||||
os.writeByte(lineBreak);
|
||||
os.writeByte(joiningType);
|
||||
os.writeByte(joiningGroup);
|
||||
os.writeByte(script);
|
||||
os.writeByte(age);
|
||||
}
|
||||
|
||||
public void readBytes(DataInputStream is) throws IOException {
|
||||
codePoint = is.readInt();
|
||||
|
||||
name = readString(is);
|
||||
decompositionMapping = readString(is);
|
||||
simpleUppercase = readString(is);
|
||||
simpleLowercase = readString(is);
|
||||
simpleTitlecase = readString(is);
|
||||
simpleCaseFolding = readString(is);
|
||||
fullUppercase = readString(is);
|
||||
fullLowercase = readString(is);
|
||||
fullTitlecase = readString(is);
|
||||
fullCaseFolding = readString(is);
|
||||
specialCasing = readString(is);
|
||||
bidiMirror = readString(is);
|
||||
|
||||
numericValue = is.readFloat();
|
||||
binaryProperties = is.readInt();
|
||||
|
||||
generalCategory = is.readByte();
|
||||
combiningClass = is.readByte();
|
||||
bidiClass = is.readByte();
|
||||
decompositionType = is.readByte();
|
||||
numericType = is.readByte();
|
||||
eastAsianWidth = is.readByte();
|
||||
lineBreak = is.readByte();
|
||||
joiningType = is.readByte();
|
||||
joiningGroup = is.readByte();
|
||||
script = is.readByte();
|
||||
age = is.readByte();
|
||||
fleshOut();
|
||||
|
||||
// HACK
|
||||
/*
|
||||
int bp = binaryProperties;
|
||||
bp &= ~(1 << CaseFoldTurkishI); // clear bit
|
||||
if (codePoint == 'i' || codePoint == 'I') {
|
||||
bp |= (1 << CaseFoldTurkishI);
|
||||
}
|
||||
if (bp != binaryProperties) {
|
||||
if (!HACK) {
|
||||
System.out.println("\tHACK Resetting CaseFoldTurkishI on U+" + Utility.hex(codePoint) + " " + name + " and others...");
|
||||
HACK = true;
|
||||
}
|
||||
binaryProperties = bp;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
if (generalCategory == Sm) {
|
||||
if ((binaryProperties & Math_PropertyMask) != 0) {
|
||||
if (!HACK) {
|
||||
System.out.println("Stripping " + Utility.hex(codePoint) + " " + name + " and others...");
|
||||
HACK = true;
|
||||
}
|
||||
binaryProperties &= ~Math_PropertyMask;
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
    // One-shot flag for the "HACK ..." / "Stripping ..." console notices in readBytes:
    // the (currently commented-out) code there prints the notice once and then sets this.
    static boolean HACK = false;
|
||||
}
|
1063
tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
Normal file
1063
tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
Normal file
File diff suppressed because it is too large
Load diff
115
tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java
Normal file
115
tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java
Normal file
|
@ -0,0 +1,115 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
//import java.text.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class WriteJavaScriptInfo {
|
||||
    // The only member of this class, writeJavascriptInfo(), is disabled by the block
    // comment below, so the class is currently an empty shell.
    // The disabled method writes "Normalization_data.js" with four JavaScript tables
    // (KD, D, CC and C). It does not compile as written: the composition enumeration
    // ("normKD.getD getComposition()") was left unfinished, which is what the TODO refers to.
    /* TODO: fix enumeration of compositions

    static public void writeJavascriptInfo() throws IOException {
        System.err.println("Writing Javascript data");
        UCD ucd = UCD.make();
        Normalizer normKD = new Normalizer(Normalizer.NFKD);
        Normalizer normD = new Normalizer(Normalizer.NFD);
        PrintWriter log = new PrintWriter(new FileOutputStream("Normalization_data.js"));

        int count = 0;
        int datasize = 0;
        int max = 0;
        int over7 = 0;
        log.println("var KD = new Object(); // NFKD compatibility decomposition mappings");
        log.println("// NOTE: Hangul is done in code!");
        CompactShortArray csa = new CompactShortArray((short)0);

        for (char c = 0; c < 0xFFFF; ++c) {
            if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
            if (0xAC00 <= c && c <= 0xD7A3) continue;
            if (normKD.hasDecomposition(c)) {
                ++count;
                String decomp = normKD.normalize(c);
                datasize += decomp.length();
                if (max < decomp.length()) max = decomp.length();
                if (decomp.length() > 7) ++over7;
                csa.setElementAt(c, (short)count);
                log.println("\t KD[0x" + Utility.hex(c) + "]='\\u" + Utility.hex(decomp,"\\u") + "';");
            }
        }
        csa.compact();
        log.println("// " + count + " NFKD mappings total");
        log.println("// " + datasize + " total characters of results");
        log.println("// " + max + " string length, maximum");
        log.println("// " + over7 + " result strings with length > 7");
        log.println("// " + csa.storage() + " trie length (doesn't count string size)");
        log.println();

        count = 0;
        datasize = 0;
        max = 0;
        log.println("var D = new Object(); // NFD canonical decomposition mappings");
        log.println("// NOTE: Hangul is done in code!");
        csa = new CompactShortArray((short)0);

        for (char c = 0; c < 0xFFFF; ++c) {
            if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
            if (0xAC00 <= c && c <= 0xD7A3) continue;
            if (normD.hasDecomposition(c)) {
                ++count;
                String decomp = normD.normalize(c);
                datasize += decomp.length();
                if (max < decomp.length()) max = decomp.length();
                csa.setElementAt(c, (short)count);
                log.println("\t D[0x" + Utility.hex(c) + "]='\\u" + Utility.hex(decomp,"\\u") + "';");
            }
        }
        csa.compact();

        log.println("// " + count + " NFD mappings total");
        log.println("// " + datasize + " total characters of results");
        log.println("// " + max + " string length, maximum");
        log.println("// " + csa.storage() + " trie length (doesn't count string size)");
        log.println();

        count = 0;
        datasize = 0;
        log.println("var CC = new Object(); // canonical class mappings");
        CompactByteArray cba = new CompactByteArray();

        for (char c = 0; c < 0xFFFF; ++c) {
            if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
            int canClass = normKD.getCanonicalClass(c);
            if (canClass != 0) {
                ++count;

                log.println("\t CC[0x" + Utility.hex(c) + "]=" + canClass + ";");
            }
        }
        cba.compact();
        log.println("// " + count + " canonical class mappings total");
        log.println("// " + cba.storage() + " trie length");
        log.println();

        count = 0;
        datasize = 0;
        log.println("var C = new Object(); // composition mappings");
        log.println("// NOTE: Hangul is done in code!");

        IntHashtable.IntEnumeration enum = normKD.getD getComposition();
        while (enum.hasNext()) {
            int key = enum.next();
            char val = (char) enum.value();
            if (0xAC00 <= val && val <= 0xD7A3) continue;
            ++count;
            log.println("\tC[0x" + Utility.hex(key) + "]=0x" + Utility.hex(val) + ";");
        }
        log.println("// " + count + " composition mappings total");
        log.println();

        log.close();
        System.err.println("Done writing Javascript data");
    }

    */
|
||||
|
||||
}
|
38
tools/unicodetools/com/ibm/text/utility/ChainException.java
Normal file
38
tools/unicodetools/com/ibm/text/utility/ChainException.java
Normal file
|
@ -0,0 +1,38 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
|
||||
import java.text.*;
|
||||
import java.io.*;
|
||||
/**
 * Unchecked exception whose message is built lazily from a
 * java.text.MessageFormat pattern plus arguments, optionally followed by a
 * description (class name, message and stack trace) of another exception
 * that caused it.
 */
public class ChainException extends RuntimeException {
    /** Private copy of the format arguments; null when none were supplied. */
    Object[] keyData;
    /** MessageFormat pattern, or the plain message when keyData is null. */
    String messageFormat;
    /** The exception being wrapped; may be null. */
    Exception chain;

    /**
     * Creates an exception without a chained cause.
     *
     * @param messageFormat MessageFormat pattern for the message
     * @param objects       arguments for the pattern; may be null, exactly as in the
     *                      three-argument constructor
     */
    public ChainException (String messageFormat, Object[] objects) {
        // Delegating keeps the null handling of both constructors identical; the
        // previous body cloned 'objects' unconditionally and threw NullPointerException.
        this(messageFormat, objects, null);
    }

    /**
     * Creates an exception that wraps another one.
     *
     * @param messageFormat    MessageFormat pattern for the message
     * @param objects          arguments for the pattern; may be null
     * @param chainedException the exception being wrapped; may be null
     */
    public ChainException (String messageFormat, Object[] objects, Exception chainedException) {
        this.messageFormat = messageFormat;
        // Defensive copy so later changes to the caller's array cannot alter the message.
        keyData = objects == null ? null : (Object[]) objects.clone();
        chain = chainedException;
    }

    /**
     * Returns the formatted message, followed (when a chained exception exists) by
     * "; " + its class name + ", " + its message + ", " + its stack trace.
     * With no arguments the pattern text itself is returned unformatted, so a plain
     * message is not lost; a null pattern contributes the empty string.
     */
    public String getMessage() {
        String chainMsg = "";
        if (chain != null) {
            StringWriter w = new StringWriter();
            PrintWriter p = new PrintWriter(w);
            chain.printStackTrace(p);
            // Make sure everything has reached the StringWriter before it is read.
            p.flush();
            chainMsg = "; " + chain.getClass().getName()
                + ", " + chain.getMessage()
                + ", " + w.getBuffer();
            p.close();
        }
        String main = "";
        if (messageFormat != null) {
            main = (keyData == null)
                ? messageFormat
                : MessageFormat.format(messageFormat, keyData);
        }
        return main + chainMsg;
    }
}
|
||||
|
305
tools/unicodetools/com/ibm/text/utility/CompactByteArray.java
Normal file
305
tools/unicodetools/com/ibm/text/utility/CompactByteArray.java
Normal file
|
@ -0,0 +1,305 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/*
|
||||
* %W% %E%
|
||||
*
|
||||
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
|
||||
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
|
||||
*
|
||||
* Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
|
||||
*
|
||||
* The original version of this source code and documentation is copyrighted
|
||||
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
|
||||
* materials are provided under terms of a License Agreement between Taligent
|
||||
* and Sun. This technology is protected by multiple US and International
|
||||
* patents. This notice and attribution to Taligent may not be removed.
|
||||
* Taligent is a registered trademark of Taligent, Inc.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* and its documentation for NON-COMMERCIAL purposes and without
|
||||
* fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies. Please refer to the file "copyright.html"
|
||||
* for further important copyright and licensing information.
|
||||
*
|
||||
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
|
||||
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
|
||||
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
|
||||
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
*
|
||||
* Provides a compact way to store information that is indexed by Unicode
|
||||
* values, such as character properties, types, keyboard values, etc.
|
||||
 * Only for internal use for now; made public for discussion purposes.
|
||||
*
|
||||
* @see CompactIntArray
|
||||
* @see CompactShortArray
|
||||
* @version %I% %G%
|
||||
* @author Helena Shih
|
||||
*/
|
||||
public final class CompactByteArray implements Serializable {

    /** Number of char values covered by the table (0x0000..0xFFFF). */
    public static final int UNICODECOUNT = 65536;

    /** Creates an uncompacted array in which every character maps to 0. */
    public CompactByteArray()
    {
        this((byte)0);
    }

    /**
     * Creates an uncompacted array in which every character maps to
     * <code>defaultValue</code>.
     */
    public CompactByteArray(byte defaultValue)
    {
        int i;
        values = new byte[UNICODECOUNT];
        indices = new short[INDEXCOUNT];
        for (i = 0; i < UNICODECOUNT; ++i) {
            values[i] = defaultValue;
        }
        // Identity layout: block n starts at offset n * BLOCKCOUNT.
        for (i = 0; i < INDEXCOUNT; ++i) {
            indices[i] = (short)(i<<BLOCKSHIFT);
        }
        isCompact = false;
    }

    /**
     * Wraps an already compacted pair of arrays. The arrays are adopted, not copied.
     *
     * @param indexArray one block start offset per block, stored as an unsigned
     *                   16-bit value in a short (the same convention elementAt and
     *                   writeArrays use); must have exactly INDEXCOUNT entries
     * @param newValues  the shared block data
     * @exception IllegalArgumentException if indexArray has the wrong length or a
     *            block would extend past the end of newValues
     */
    public CompactByteArray(short indexArray[],
                            byte newValues[]) throws IllegalArgumentException
    {
        int i;
        if (indexArray.length != INDEXCOUNT)
            throw new IllegalArgumentException("Index array must have " + INDEXCOUNT + " entries.");
        for (i = 0; i < INDEXCOUNT; ++i) {
            // Offsets of 0x8000 and above are stored as negative shorts; read them unsigned.
            int index = indexArray[i] & 0xFFFF;
            // Every slot of the block must be addressable, since elementAt does no checking.
            if (index + BLOCKCOUNT > newValues.length)
                throw new IllegalArgumentException("Index out of bounds.");
        }
        indices = indexArray;
        values = newValues;
        isCompact = true;
    }

    /**
     * Prints Java source for a class that rebuilds this table, then closes
     * <code>output</code>. The package and class name of the generated source
     * (com.ibm.text.unicode.GeneralCategory) are hard-coded.
     */
    public void writeArrays(PrintWriter output)
    {
        int i;
        output.println("package com.ibm.text.unicode;");
        output.println("import com.ibm.text.collections.*;");

        output.println("public final class GeneralCategory {");

        output.println(" public static byte getCategory (char ch) {");
        output.println(" return compactArray.elementAt(ch);");
        output.println(" }");

        output.println(" static CompactByteArray compactArray;");

        output.println(" static void init () {");
        output.println(" short[] index = {");
        for (i = 0; i < indices.length; i++) {
            if (i % 8 == 0) output.println();
            output.print("(short)" + (indices[i] & 0xFFFF) + ", ");
        }
        output.println(" };");

        output.println(" byte[] data = {");
        for (i = 0; i < values.length; i++) {
            if (i % 8 == 0) output.println();
            output.print("(byte)" + (values[i] & 0xFF) + ", ");
        }
        output.println(" };");
        output.println(" compactArray = new CompactByteArray(index, data);");
        output.println(" }");
        output.println("}");
        output.close();
    }

    /**
     * Returns the value stored for <code>index</code>: the high bits pick the
     * block's start offset (read unsigned), the low BLOCKSHIFT bits pick the slot.
     */
    public byte elementAt(char index) // parameterized on byte
    {
        return (values[(indices[index >>> BLOCKSHIFT] & 0xFFFF) +
                       (index & BLOCKMASK)]);
    }

    /**
     * Sets the value for one character.
     * Automatically expands the array first if it is compacted.
     */
    public void setElementAt(char index, byte value)
    {
        if (isCompact)
            expand();
        values[(int)index] = value;
    }

    /**
     * Sets the value for every character in start..end (both inclusive).
     * Automatically expands the array first if it is compacted.
     */
    public void setElementAt(char start, char end, byte value)
    {
        int i;  // int, so the loop also terminates when end == 0xFFFF
        if (isCompact) {
            expand();
        }
        for (i = start; i <= end; ++i) {
            values[i] = value;
        }
    }

    /**
     * Compacts the array: every block of BLOCKCOUNT values that equals a block
     * already placed in the output is stored only once. Does nothing when the
     * array is already compact.
     */
    public void compact()
    {
        if (isCompact == false) {
            // tempIndex[k] = position in the uncompacted values array that supplies
            // slot k of the compacted array. Sized for the worst case (no sharing).
            char[] tempIndex = new char[UNICODECOUNT];

            // The first block always stays at offset 0.
            int tempIndexCount = BLOCKCOUNT;
            for (int k = 0; k < BLOCKCOUNT; ++k) {
                tempIndex[k] = (char)k;
            }
            indices[0] = (short)0;

            // For each successive block, find its first position in the compacted array.
            for (int iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) {
                int block = iBlock<<BLOCKSHIFT;
                if (DEBUGSMALL) if (block > DEBUGSMALLLIMIT) break;
                int firstPosition = FindOverlappingPosition(block, tempIndex,
                                                            tempIndexCount);

                int newCount = firstPosition + BLOCKCOUNT;
                if (newCount > tempIndexCount) {
                    // No existing copy: append the block. The counter is an int because
                    // the compacted size can reach 65536, which a short cannot hold.
                    for (int k = tempIndexCount; k < newCount; ++k) {
                        tempIndex[k] = (char)(k - firstPosition + block);
                    }
                    tempIndexCount = newCount;
                }
                // Stored as an unsigned 16-bit value; readers mask with 0xFFFF.
                indices[iBlock] = (short)firstPosition;
            }

            // Now allocate and copy the items into the compacted array.
            byte[] tempArray = new byte[tempIndexCount];
            for (int k = 0; k < tempIndexCount; ++k) {
                tempArray[k] = values[tempIndex[k]];
            }
            values = tempArray;
            isCompact = true;
        }
    }

    /**
     * Expands a compacted array back to a 65536 element array with the identity
     * block layout. Does nothing when the array is not compact.
     */
    public void expand()
    {
        int i;
        if (isCompact) {
            byte[] tempArray = new byte[UNICODECOUNT];
            // Read everything through the old layout before the indices are reset.
            for (i = 0; i < UNICODECOUNT; ++i) {
                tempArray[i] = elementAt((char)i);
            }
            for (i = 0; i < INDEXCOUNT; ++i) {
                indices[i] = (short)(i<<BLOCKSHIFT);
            }
            values = tempArray;
            isCompact = false;
        }
    }

    /**
     * Debug only: prints index entries start..count-1 as unsigned values.
     * Note that <code>count</code> is used as an exclusive end position.
     */
    public void printIndex(short start, short count)
    {
        int i;
        for (i = start; i < count; ++i)
        {
            System.out.println(i + " -> : " +
                               (int)((indices[i] >= 0) ?
                                     indices[i] :
                                     indices[i] + UNICODECOUNT));
        }
        System.out.println();
    }

    /**
     * Debug only: prints <code>count</code> values beginning at <code>start</code>,
     * looked up through <code>tempIndex</code> when it is non-null and read
     * directly from the values array otherwise.
     */
    public void printPlainArray(int start,int count, char[] tempIndex)
    {
        int iIndex;
        if (tempIndex != null)
        {
            for (iIndex = start; iIndex < start + count; ++iIndex)
            {
                System.out.print(" " + (int)values[tempIndex[iIndex]]);
            }
        }
        else
        {
            for (iIndex = start; iIndex < start + count; ++iIndex)
            {
                System.out.print(" " + (int)values[iIndex]);
            }
        }
        System.out.println(" Range: start " + start + " , count " + count);
    }

    /**
     * Number of elements in the values array, narrowed to a short. The cast wraps
     * for lengths above 32767 (an uncompacted array of 65536 elements reports 0);
     * the return type is kept for existing callers. storage() gives an int-sized figure.
     */
    public short capacity()
    {
        return (short)values.length;
    }

    /**
     * Size estimate: one unit per value, two per index entry, plus a constant 12.
     */
    public int storage()
    {
        return values.length * 1 + indices.length * 2 + 12;
    }

    private byte[] getArray()
    {
        return values;
    }

    /**
     * Searches the compacted layout built so far, one block-aligned position at a
     * time, for a block equal to the uncompacted block beginning at
     * <code>start</code>.
     *
     * @return the position of the first match, or the first block-aligned
     *         position at or after tempIndexCount when there is none
     */
    private int
    FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount)
    {
        int i;
        int j;
        int currentCount;

        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            printPlainArray(start, BLOCKCOUNT, null);
            printPlainArray(0, tempIndexCount, tempIndex);
        }
        for (i = 0; i < tempIndexCount; i += BLOCKCOUNT) {
            currentCount = BLOCKCOUNT;
            if (i + BLOCKCOUNT > tempIndexCount) {
                currentCount = tempIndexCount - i;
            }
            for (j = 0; j < currentCount; ++j) {
                if (values[start + j] != values[tempIndex[i + j]]) break;
            }
            if (j == currentCount) break;
        }
        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            for (j = 1; j < i; ++j) {
                System.out.print(" ");
            }
            printPlainArray(start, BLOCKCOUNT, null);
            System.out.println(" Found At: " + i);
        }
        return i;
    }

    private static final int DEBUGSHOWOVERLAPLIMIT = 100;
    private static final boolean DEBUGTRACE = false;
    private static final boolean DEBUGSMALL = false;
    private static final boolean DEBUGOVERLAP = false;
    private static final int DEBUGSMALLLIMIT = 30000;
    private static final int BLOCKSHIFT =6;
    private static final int BLOCKCOUNT =(1<<BLOCKSHIFT);
    private static final int INDEXSHIFT =(16-BLOCKSHIFT);
    private static final int INDEXCOUNT =(1<<INDEXSHIFT);
    private static final int BLOCKMASK = BLOCKCOUNT - 1;

    private byte[] values;  // char -> byte, addressed through indices
    private short indices[];  // block start offsets, stored as unsigned 16-bit values
    private boolean isCompact;
}
|
367
tools/unicodetools/com/ibm/text/utility/CompactShortArray.java
Normal file
367
tools/unicodetools/com/ibm/text/utility/CompactShortArray.java
Normal file
|
@ -0,0 +1,367 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
|
||||
/*
|
||||
* %W% %E%
|
||||
*
|
||||
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
|
||||
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
|
||||
*
|
||||
* Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
|
||||
*
|
||||
* The original version of this source code and documentation is copyrighted
|
||||
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
|
||||
* materials are provided under terms of a License Agreement between Taligent
|
||||
* and Sun. This technology is protected by multiple US and International
|
||||
* patents. This notice and attribution to Taligent may not be removed.
|
||||
* Taligent is a registered trademark of Taligent, Inc.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* and its documentation for NON-COMMERCIAL purposes and without
|
||||
* fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies. Please refer to the file "copyright.html"
|
||||
* for further important copyright and licensing information.
|
||||
*
|
||||
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
|
||||
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
|
||||
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
|
||||
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.*;
|
||||
/**
|
||||
* class CompactATypeArray : use only on primitive data types
|
||||
* Provides a compact way to store information that is indexed by Unicode
|
||||
 * values, such as character properties, types, keyboard values, etc. This
|
||||
* is very useful when you have a block of Unicode data that contains
|
||||
* significant values while the rest of the Unicode data is unused in the
|
||||
 * application or when you have a lot of redundancy, such as where all 21,000
|
||||
* Han ideographs have the same value. However, lookup is much faster than a
|
||||
* hash table.
|
||||
* A compact array of any primitive data type serves two purposes:
|
||||
* <UL type = round>
|
||||
* <LI>Fast access of the indexed values.
|
||||
* <LI>Smaller memory footprint.
|
||||
* </UL>
|
||||
* A compact array is composed of a index array and value array. The index
|
||||
 * array contains the indices of Unicode characters to the value array.
|
||||
* @see CompactByteArray
|
||||
* @see CompactIntArray
|
||||
* @see CompactCharArray
|
||||
* @see CompactStringArray
|
||||
* @version %I% %G%
|
||||
* @author Helena Shih
|
||||
*/
|
||||
public final class CompactShortArray implements Serializable {
|
||||
|
||||
|
||||
/**
|
||||
* The total number of Unicode characters.
|
||||
*/
|
||||
public static final int UNICODECOUNT =65536;
|
||||
|
||||
/**
|
||||
* Default constructor for CompactShortArray, the default value of the
|
||||
* compact array is 0.
|
||||
*/
|
||||
public CompactShortArray()
|
||||
{
|
||||
this((short)0);
|
||||
}
|
||||
/**
|
||||
* Constructor for CompactShortArray.
|
||||
* @param defaultValue the default value of the compact array.
|
||||
*/
|
||||
public CompactShortArray(short defaultValue)
|
||||
{
|
||||
int i;
|
||||
values = new short[UNICODECOUNT];
|
||||
indices = new short[INDEXCOUNT];
|
||||
for (i = 0; i < UNICODECOUNT; ++i) {
|
||||
values[i] = defaultValue;
|
||||
}
|
||||
for (i = 0; i < INDEXCOUNT; ++i) {
|
||||
indices[i] = (short)(i<<BLOCKSHIFT);
|
||||
}
|
||||
isCompact = false;
|
||||
}
|
||||
/**
|
||||
* Constructor for CompactShortArray.
|
||||
 * @param indexArray the indices of the compact array.
|
||||
* @param newValues the values of the compact array.
|
||||
* @exception IllegalArgumentException If the index is out of range.
|
||||
*/
|
||||
public CompactShortArray(short indexArray[],
|
||||
short newValues[]) throws IllegalArgumentException
|
||||
{
|
||||
int i;
|
||||
if (indexArray.length != INDEXCOUNT)
|
||||
throw new IllegalArgumentException("Index out of bounds.");
|
||||
for (i = 0; i < INDEXCOUNT; ++i) {
|
||||
short index = indexArray[i];
|
||||
if ((index < 0) || (index >= newValues.length+BLOCKCOUNT))
|
||||
throw new IllegalArgumentException("Index out of bounds.");
|
||||
}
|
||||
indices = indexArray;
|
||||
values = newValues;
|
||||
}
|
||||
/**
|
||||
* Get the mapped value of a Unicode character.
|
||||
* @param index the character to get the mapped value with
|
||||
* @return the mapped value of the given character
|
||||
*/
|
||||
public short elementAt(char index) // parameterized on short
|
||||
{
|
||||
return (values[(indices[index >> BLOCKSHIFT] & 0xFFFF)
|
||||
+ (index & BLOCKMASK)]);
|
||||
}
|
||||
/**
|
||||
* Set a new value for a Unicode character.
|
||||
* Set automatically expands the array if it is compacted.
|
||||
* @param index the character to set the mapped value with
|
||||
* @param value the new mapped value
|
||||
*/
|
||||
public void setElementAt(char index, short value)
|
||||
{
|
||||
if (isCompact)
|
||||
expand();
|
||||
values[(int)index] = value;
|
||||
}
|
||||
/**
|
||||
* Set new values for a range of Unicode character.
|
||||
* @param start the starting offset of the range
|
||||
* @param end the ending offset of the range
|
||||
* @param value the new mapped value
|
||||
*/
|
||||
public void setElementAt(char start, char end, short value)
|
||||
{
|
||||
int i;
|
||||
if (isCompact) {
|
||||
expand();
|
||||
}
|
||||
for (i = start; i <= end; ++i) {
|
||||
values[i] = value;
|
||||
}
|
||||
}
|
||||
/**
|
||||
*Compact the array.
|
||||
*/
|
||||
public void compact()
|
||||
{
|
||||
if (isCompact == false) {
|
||||
char[] tempIndex;
|
||||
int tempIndexCount;
|
||||
short[] tempArray;
|
||||
short iBlock, iIndex;
|
||||
|
||||
// make temp storage, larger than we need
|
||||
tempIndex = new char[UNICODECOUNT];
|
||||
// set up first block.
|
||||
tempIndexCount = BLOCKCOUNT;
|
||||
for (iIndex = 0; iIndex < BLOCKCOUNT; ++iIndex) {
|
||||
tempIndex[iIndex] = (char)iIndex;
|
||||
}; // endfor (iIndex = 0; .....)
|
||||
indices[0] = (short)0;
|
||||
|
||||
// for each successive block, find out its first position
|
||||
// in the compacted array
|
||||
for (iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) {
|
||||
int newCount, firstPosition, block;
|
||||
block = iBlock<<BLOCKSHIFT;
|
||||
if (DEBUGSMALL) if (block > DEBUGSMALLLIMIT) break;
|
||||
firstPosition = FindOverlappingPosition(block, tempIndex,
|
||||
tempIndexCount);
|
||||
|
||||
newCount = firstPosition + BLOCKCOUNT;
|
||||
if (newCount > tempIndexCount) {
|
||||
for (iIndex = (short)tempIndexCount;
|
||||
iIndex < newCount;
|
||||
++iIndex) {
|
||||
tempIndex[iIndex]
|
||||
= (char)(iIndex - firstPosition + block);
|
||||
} // endfor (iIndex = tempIndexCount....)
|
||||
tempIndexCount = newCount;
|
||||
} // endif (newCount > tempIndexCount)
|
||||
indices[iBlock] = (short)firstPosition;
|
||||
} // endfor (iBlock = 1.....)
|
||||
|
||||
// now allocate and copy the items into the array
|
||||
tempArray = new short[tempIndexCount];
|
||||
for (iIndex = 0; iIndex < tempIndexCount; ++iIndex) {
|
||||
tempArray[iIndex] = values[tempIndex[iIndex]];
|
||||
}
|
||||
values = null;
|
||||
values = tempArray;
|
||||
isCompact = true;
|
||||
} // endif (isCompact != false)
|
||||
}
|
||||
/** For internal use only. Do not modify the result, the behavior of
|
||||
* modified results are undefined.
|
||||
*/
|
||||
public short getIndexArray()[]
|
||||
{
|
||||
return indices;
|
||||
}
|
||||
/** For internal use only. Do not modify the result, the behavior of
|
||||
* modified results are undefined.
|
||||
*/
|
||||
public short getStringArray()[]
|
||||
{
|
||||
return values;
|
||||
}
|
||||
// --------------------------------------------------------------
|
||||
// package private
|
||||
// --------------------------------------------------------------
|
||||
void writeArrays()
|
||||
{
|
||||
int i;
|
||||
int cnt = ((values.length > 0) ? values.length :
|
||||
(values.length + UNICODECOUNT));
|
||||
System.out.println("{");
|
||||
for (i = 0; i < INDEXCOUNT-1; i++)
|
||||
{
|
||||
System.out.print("(short)" + (int)((getIndexArrayValue(i) >= 0) ?
|
||||
(int)getIndexArrayValue(i) :
|
||||
(int)(getIndexArrayValue(i)+UNICODECOUNT)) + ", ");
|
||||
if (i != 0)
|
||||
if (i % 10 == 0)
|
||||
System.out.println();
|
||||
}
|
||||
System.out.println("(short)" +
|
||||
(int)((getIndexArrayValue(INDEXCOUNT-1) >= 0) ?
|
||||
(int)getIndexArrayValue(i) :
|
||||
(int)(getIndexArrayValue(i)+UNICODECOUNT)) +
|
||||
" }");
|
||||
System.out.println("{");
|
||||
for (i = 0; i < cnt-1; i++)
|
||||
{
|
||||
System.out.print("(short)" + (int)getArrayValue(i) + ", ");
|
||||
if (i != 0)
|
||||
if (i % 10 == 0)
|
||||
System.out.println();
|
||||
}
|
||||
System.out.println("(short)" + (int)getArrayValue(cnt-1) + " }");
|
||||
}
|
||||
// Print char Array : Debug only
|
||||
void printIndex(short start, short count)
|
||||
{
|
||||
int i;
|
||||
for (i = start; i < count; ++i)
|
||||
{
|
||||
System.out.println(i + " -> : " +
|
||||
(int)((indices[i] >= 0) ?
|
||||
indices[i] :
|
||||
indices[i] + UNICODECOUNT));
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
void printPlainArray(int start,int count, char[] tempIndex)
|
||||
{
|
||||
int iIndex;
|
||||
if (tempIndex != null)
|
||||
{
|
||||
for (iIndex = start; iIndex < start + count; ++iIndex)
|
||||
{
|
||||
System.out.print(" " + (int)getArrayValue(tempIndex[iIndex]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (iIndex = start; iIndex < start + count; ++iIndex)
|
||||
{
|
||||
System.out.print(" " + (int)getArrayValue(iIndex));
|
||||
}
|
||||
}
|
||||
System.out.println(" Range: start " + start + " , count " + count);
|
||||
}
|
||||
// --------------------------------------------------------------
|
||||
// private
|
||||
// --------------------------------------------------------------
|
||||
/**
|
||||
* Expanding takes the array back to a 65536 element array.
|
||||
*/
|
||||
private void expand()
|
||||
{
|
||||
int i;
|
||||
if (isCompact) {
|
||||
short[] tempArray;
|
||||
tempArray = new short[UNICODECOUNT];
|
||||
for (i = 0; i < UNICODECOUNT; ++i) {
|
||||
tempArray[i] = elementAt((char)i);
|
||||
}
|
||||
for (i = 0; i < INDEXCOUNT; ++i) {
|
||||
indices[i] = (short)(i<<BLOCKSHIFT);
|
||||
}
|
||||
values = null;
|
||||
values = tempArray;
|
||||
isCompact = false;
|
||||
}
|
||||
}
|
||||
// # of elements in the indexed array
|
||||
private short capacity()
|
||||
{
|
||||
return (short)values.length;
|
||||
}
|
||||
public int storage()
|
||||
{
|
||||
return values.length * 2 + indices.length * 2 + 12;
|
||||
}
|
||||
|
||||
private short getArrayValue(int n)
|
||||
{
|
||||
return values[n];
|
||||
}
|
||||
private short getIndexArrayValue(int n)
|
||||
{
|
||||
return indices[n];
|
||||
}
|
||||
private int
|
||||
FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount)
|
||||
{
|
||||
int i;
|
||||
short j;
|
||||
short currentCount;
|
||||
|
||||
if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
|
||||
printPlainArray(start, BLOCKCOUNT, null);
|
||||
printPlainArray(0, tempIndexCount, tempIndex);
|
||||
}
|
||||
for (i = 0; i < tempIndexCount; i += BLOCKCOUNT) {
|
||||
currentCount = (short)BLOCKCOUNT;
|
||||
if (i + BLOCKCOUNT > tempIndexCount) {
|
||||
currentCount = (short)(tempIndexCount - i);
|
||||
}
|
||||
for (j = 0; j < currentCount; ++j) {
|
||||
if (values[start + j] != values[tempIndex[i + j]]) break;
|
||||
}
|
||||
if (j == currentCount) break;
|
||||
}
|
||||
if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
|
||||
for (j = 1; j < i; ++j) {
|
||||
System.out.print(" ");
|
||||
}
|
||||
printPlainArray(start, BLOCKCOUNT, null);
|
||||
System.out.println(" Found At: " + i);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
private static final int DEBUGSHOWOVERLAPLIMIT = 100;
|
||||
private static final boolean DEBUGTRACE = false;
|
||||
private static final boolean DEBUGSMALL = false;
|
||||
private static final boolean DEBUGOVERLAP = false;
|
||||
private static final int DEBUGSMALLLIMIT = 30000;
|
||||
private static final int BLOCKSHIFT =7;
|
||||
private static final int BLOCKCOUNT =(1<<BLOCKSHIFT);
|
||||
private static final int INDEXSHIFT =(16-BLOCKSHIFT);
|
||||
private static final int INDEXCOUNT =(1<<INDEXSHIFT);
|
||||
private static final int BLOCKMASK = BLOCKCOUNT - 1;
|
||||
|
||||
private short values[]; // char -> short (char parameterized short)
|
||||
private short indices[];
|
||||
private boolean isCompact;
|
||||
};
|
65
tools/unicodetools/com/ibm/text/utility/Counter.java
Normal file
65
tools/unicodetools/com/ibm/text/utility/Counter.java
Normal file
|
@ -0,0 +1,65 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
//import com.ibm.text.unicode.UInfo;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
|
||||
/**
 * Accumulates a weight per string key. Each call to {@link #add(String)}
 * increases the key's tally by the length of the string.
 */
public final class Counter {
    Map map = new HashMap();

    /**
     * Mutable int holder. Instances order by descending {@code value}; ties
     * are broken by creation order (later instances first), so two distinct
     * instances never compare as equal and can coexist as sorted-map keys.
     */
    static public final class RWInteger implements Comparable {
        static int uniqueCount;
        public int value;
        private int forceUnique = uniqueCount++;

        public int compareTo(Object other) {
            RWInteger that = (RWInteger) other;
            if (value != that.value) {
                return (value > that.value) ? -1 : 1;
            }
            if (forceUnique != that.forceUnique) {
                return (forceUnique > that.forceUnique) ? -1 : 1;
            }
            return 0;
        }

        public String toString() {
            return String.valueOf(value);
        }
    }

    /**
     * Adds {@code obj.length()} to the tally kept for {@code obj}, creating
     * the tally on first use.
     */
    public void add(String obj) {
        Object existing = map.get(obj);
        RWInteger tally;
        if (existing != null) {
            tally = (RWInteger) existing;
        } else {
            tally = new RWInteger();
            map.put(obj, tally);
        }
        tally.value += obj.length();
    }

    /**
     * Returns a new sorted map from tally to key, iterated in the order
     * defined by {@link RWInteger#compareTo(Object)}.
     */
    public Map getSortedByCount() {
        Map byCount = new TreeMap();
        for (Iterator entries = map.entrySet().iterator(); entries.hasNext(); ) {
            Map.Entry entry = (Map.Entry) entries.next();
            byCount.put(entry.getValue(), entry.getKey());
        }
        return byCount;
    }

    /**
     * Returns a new map in which every counted key is mapped to itself.
     */
    public Map getKeyToKey() {
        Map identity = new HashMap();
        for (Iterator keys = map.keySet().iterator(); keys.hasNext(); ) {
            Object key = keys.next();
            identity.put(key, key);
        }
        return identity;
    }
}
|
164
tools/unicodetools/com/ibm/text/utility/Differ.java
Normal file
164
tools/unicodetools/com/ibm/text/utility/Differ.java
Normal file
|
@ -0,0 +1,164 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/**
 * Basic diff engine. Items from two sequences are fed in with
 * {@link #addA(Object)} / {@link #addB(Object)}; after each step
 * {@link #checkMatch(boolean)} decides how much of the buffered input forms a
 * difference, which can then be read back through the accessors.
 * For a usage example, see DifferTest.
 * @author Mark Davis
 * @version 1.0
 */
final public class Differ {
    public static final String copyright =
      "Copyright (C) 2000, International Business Machines Corporation and others. All Rights Reserved.";

    private int STACKSIZE;
    private int EQUALSIZE;

    private Object [] a;
    private Object [] b;
    private Object last = "";
    private Object next = "";
    private int aCount = 0;
    private int bCount = 0;
    private int aLine = 1;
    private int bLine = 1;
    private int maxSame = 0, aTop = 0, bTop = 0;

    /**
     * @param stackSize The size of the largest difference you expect.
     * @param matchCount The number of items that have to be the same to count as a match
     */
    public Differ(int stackSize, int matchCount) {
        this.STACKSIZE = stackSize;
        this.EQUALSIZE = matchCount;
        final int capacity = stackSize + matchCount;
        a = new Object[capacity];
        b = new Object[capacity];
    }

    /** Feeds one item to each side. */
    public void add (Object aStr, Object bStr) {
        addA(aStr);
        addB(bStr);
    }

    /** Drops what the previous check consumed, then buffers an item on side A. */
    public void addA (Object aStr) {
        flush();
        a[aCount++] = aStr;
    }

    /** Drops what the previous check consumed, then buffers an item on side B. */
    public void addB (Object bStr) {
        flush();
        b[bCount++] = bStr;
    }

    public int getALine(int offset) {
        return aLine + maxSame + offset;
    }

    /**
     * Returns the remembered preceding item for a negative offset, the
     * remembered following item for an offset past the current difference,
     * and the buffered A item at {@code offset} otherwise.
     */
    public Object getA(int offset) {
        if (offset < 0) {
            return last;
        }
        return (offset > aTop - maxSame) ? next : a[offset];
    }

    public int getACount() {
        return aTop-maxSame;
    }

    public int getBCount() {
        return bTop-maxSame;
    }

    public int getBLine(int offset) {
        return bLine + maxSame + offset;
    }

    /** Same as {@link #getA(int)}, for side B. */
    public Object getB(int offset) {
        if (offset < 0) {
            return last;
        }
        return (offset > bTop - maxSame) ? next : b[offset];
    }

    /**
     * Re-evaluates the buffered items: measures the common prefix, then
     * (unless this is the final pass) looks for the last {@code EQUALSIZE}
     * items of one side inside the other side to find where the sequences
     * line up again.
     *
     * @param finalPass true when no more input will be added; everything
     *        after the common prefix is then reported as the difference
     */
    public void checkMatch(boolean finalPass) {
        // length of the common prefix of both buffers
        final int limit = (aCount > bCount) ? bCount : aCount;
        int same = 0;
        while (same < limit && a[same].equals(b[same])) {
            ++same;
        }

        maxSame = same;
        aTop = same;
        bTop = same;
        if (same > 0) {
            last = a[same - 1];
        }
        next = "";

        if (finalPass) {
            aTop = aCount;
            bTop = bCount;
            return;
        }

        // not enough unmatched items on one side to look for a re-sync yet
        if (aCount - same < EQUALSIZE || bCount - same < EQUALSIZE) {
            return;
        }

        // do the last few a's occur anywhere in the b's?
        int hit = find(a, aCount - EQUALSIZE, aCount, b, same, bCount);
        if (hit != -1) {
            aTop = aCount - EQUALSIZE;
            bTop = hit;
            next = a[aTop];
            return;
        }

        // ... or the last few b's anywhere in the a's?
        hit = find(b, bCount - EQUALSIZE, bCount, a, same, aCount);
        if (hit != -1) {
            bTop = bCount - EQUALSIZE;
            aTop = hit;
            next = b[bTop];
            return;
        }

        // no re-sync and a buffer has reached its limit: discard some items
        if (aCount >= STACKSIZE || bCount >= STACKSIZE) {
            aCount = (aCount + same) / 2;
            bCount = (bCount + same) / 2;
            next = "";
        }
    }

    /** Convenient utility
     * finds a segment of the first array in the second array.
     * @return -1 if not found, otherwise start position in b
     */
    public int find (Object[] a, int aStart, int aEnd, Object[] b, int bStart, int bEnd) {
        final int len = aEnd - aStart;
        final int lastStart = bEnd - len;
        for (int pos = bStart; pos <= lastStart; ++pos) {
            int k = 0;
            while (k < len && b[pos + k].equals(a[aStart + k])) {
                ++k;
            }
            if (k >= len) {
                return pos; // every item of the segment matched
            }
        }
        return -1;
    }

    // ====================== PRIVATES ======================

    /**
     * Removes the items below aTop / bTop from the front of each buffer and
     * advances the corresponding line counters by the amount removed.
     */
    private void flush() {
        if (aTop != 0) {
            final int kept = aCount - aTop;
            System.arraycopy(a, aTop, a, 0, kept);
            aCount = kept;
            aLine += aTop;
            aTop = 0;
        }

        if (bTop != 0) {
            final int kept = bCount - bTop;
            System.arraycopy(b, bTop, b, 0, kept);
            bCount = kept;
            bLine += bTop;
            bTop = 0;
        }
    }
}
|
37
tools/unicodetools/com/ibm/text/utility/DifferTest.java
Normal file
37
tools/unicodetools/com/ibm/text/utility/DifferTest.java
Normal file
|
@ -0,0 +1,37 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
|
||||
public class DifferTest {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
static final void main(String[] args) { // for testing
|
||||
|
||||
String[] as = {"a", "b", "20D4", "0344", "20D5", "20D6", "20D7", "20D8", "20D9"};
|
||||
String[] bs = {"a", "b", "20D4", "20D5", "0344", "20D6", "20D7", "20D8", "20D9"};
|
||||
Differ differ = new Differ(50,2);
|
||||
int max = as.length;
|
||||
if (max < bs.length) max = bs.length;
|
||||
for (int j = 0; j <= max; ++j) {
|
||||
if (j < as.length) differ.addA(as[j]);
|
||||
if (j < bs.length) differ.addB(bs[j]);
|
||||
differ.checkMatch(j == max);
|
||||
|
||||
if (differ.getACount() != 0 || differ.getBCount() != 0) {
|
||||
if (differ.getACount() != 0) {
|
||||
for (int i = -1; i < differ.getACount()+1; ++i) {
|
||||
System.out.println("a: " + differ.getALine(i) + " " + differ.getA(i));
|
||||
}
|
||||
}
|
||||
if (differ.getBCount() != 0) {
|
||||
if (differ.getACount() != 0) System.out.println();
|
||||
for (int i = -1; i < differ.getBCount()+1; ++i) {
|
||||
System.out.println("b: " + differ.getBLine(i) + " " + differ.getB(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
System.out.println("----");
|
||||
//differ.flush();
|
||||
}
|
||||
}
|
||||
}
|
52
tools/unicodetools/com/ibm/text/utility/DualWriter.java
Normal file
52
tools/unicodetools/com/ibm/text/utility/DualWriter.java
Normal file
|
@ -0,0 +1,52 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.awt.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
 * A {@link Writer} that duplicates everything written to it onto two
 * underlying writers, optionally flushing both after every write.
 */
final public class DualWriter extends Writer {
    private static final String copyright = "(C) Copyright IBM Corp. 1998 - All Rights Reserved";
    // Abstract class for writing to character streams.
    // The only methods that a subclass must implement are
    // write(char[], int, int), flush(), and close().

    private boolean autoflush ;
    private Writer a;
    private Writer b;

    /**
     * Creates a writer that forwards to both targets, with auto-flush off.
     *
     * @param a first target
     * @param b second target
     */
    public DualWriter (Writer a, Writer b) {
        this.a = a;
        this.b = b;
    }

    /**
     * Creates a writer that forwards to both targets.
     *
     * @param a first target
     * @param b second target
     * @param autoFlush when true, both targets are flushed after every write
     */
    public DualWriter (Writer a, Writer b, boolean autoFlush) {
        this.a = a;
        this.b = b;
        autoflush = autoFlush;
    }

    public void setAutoFlush(boolean value) {
        autoflush = value;
    }

    public boolean getAutoFlush() {
        return autoflush;
    }

    /**
     * Writes the given characters to the first target, then to the second,
     * then flushes both if auto-flush is on.
     *
     * @throws IOException if either target fails
     */
    public void write(char cbuf[],
                      int off,
                      int len) throws IOException {
        a.write(cbuf, off, len);
        b.write(cbuf, off, len);
        if (autoflush) flush();
    }

    /**
     * Closes both targets. The second target is closed even when closing the
     * first one throws, so a failure on one side does not leak the other.
     *
     * @throws IOException if closing a target fails; when both fail, the
     *         exception from the second target is the one propagated
     */
    public void close() throws IOException {
        try {
            a.close();
        } finally {
            b.close();
        }
    }

    /**
     * Flushes the first target, then the second.
     *
     * @throws IOException if either target fails
     */
    public void flush() throws IOException {
        a.flush();
        b.flush();
    }
}
|
152
tools/unicodetools/com/ibm/text/utility/EquivalenceClass.java
Normal file
152
tools/unicodetools/com/ibm/text/utility/EquivalenceClass.java
Normal file
|
@ -0,0 +1,152 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
public class EquivalenceClass {
|
||||
static final boolean DEBUG = false;
|
||||
/**
|
||||
* Takes a many:many relation between source and value.
|
||||
* Produces equivalence class.
|
||||
* Two sources are in the same equivalence class any time they share the same value.
|
||||
*/
|
||||
// associated with each value, we keep a set of sources.
|
||||
// whenever we add a <source, value> pair, we see if any sets collide.
|
||||
// associated with each set of sources, we keep a representative Whenever we add to the set, if we
|
||||
//
|
||||
Map sourceToEquiv = new HashMap();
|
||||
Map valueToRepresentativeSource = new HashMap();
|
||||
Map forcedMerge = new HashMap();
|
||||
/**
|
||||
* @return true if made a difference
|
||||
*/
|
||||
|
||||
String itemSeparator;
|
||||
int places;
|
||||
boolean hex;
|
||||
|
||||
public EquivalenceClass() {
|
||||
this(",", 4, true);
|
||||
}
|
||||
|
||||
public EquivalenceClass(String itemSeparator, int places, boolean hex) {
|
||||
this.itemSeparator = itemSeparator;
|
||||
this.places = places;
|
||||
this.hex = hex;
|
||||
}
|
||||
|
||||
public boolean add(Object source, Object value) {
|
||||
boolean result = false;
|
||||
Object repSource = valueToRepresentativeSource.get(value);
|
||||
Set equivSet = (Set)sourceToEquiv.get(source);
|
||||
Set fm = (Set)forcedMerge.get(source);
|
||||
if (fm == null) {
|
||||
fm = new TreeSet();
|
||||
forcedMerge.put(source, fm);
|
||||
}
|
||||
|
||||
if (DEBUG) System.out.println("+Source " + source
|
||||
+ ", value: " + value);
|
||||
if (repSource == null && equivSet == null) {
|
||||
equivSet = new HashSet();
|
||||
equivSet.add(source);
|
||||
sourceToEquiv.put(source, equivSet);
|
||||
valueToRepresentativeSource.put(value, source);
|
||||
repSource = source; // for debugging
|
||||
} else if (equivSet == null) {
|
||||
equivSet = (Set) sourceToEquiv.get(repSource);
|
||||
equivSet.add(source);
|
||||
sourceToEquiv.put(source, equivSet);
|
||||
result = true;
|
||||
} else if (repSource == null) {
|
||||
valueToRepresentativeSource.put(value, source);
|
||||
repSource = source; // for debugging;
|
||||
} else { // both non-null
|
||||
Set repEquiv = (Set) sourceToEquiv.get(repSource);
|
||||
if (!repEquiv.equals(equivSet)) {
|
||||
|
||||
result = true;
|
||||
if (DEBUG) System.out.println("Merging (" + repSource + ") " + toString(repEquiv)
|
||||
+ " + (" + source + ") " + toString(equivSet));
|
||||
// merge!!
|
||||
// put all items from equivSet into repEquiv
|
||||
repEquiv.addAll(equivSet);
|
||||
|
||||
// now add the values to the forced sets
|
||||
Iterator it = repEquiv.iterator();
|
||||
while (it.hasNext()) {
|
||||
Object n = it.next();
|
||||
fm = (Set)forcedMerge.get(n);
|
||||
fm.add(value);
|
||||
}
|
||||
|
||||
// then replace all instances for equivSet by repEquiv
|
||||
// we have to do this in two steps, since iterators are invalidated by changes
|
||||
Set toReplace = new HashSet();
|
||||
it = sourceToEquiv.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object otherSource = it.next();
|
||||
Set otherSet = (Set) sourceToEquiv.get(otherSource);
|
||||
if (otherSet == equivSet) {
|
||||
toReplace.add(otherSource);
|
||||
}
|
||||
}
|
||||
it = toReplace.iterator();
|
||||
while (it.hasNext()) {
|
||||
Object otherSource = it.next();
|
||||
sourceToEquiv.put(otherSource,repEquiv);
|
||||
}
|
||||
equivSet = repEquiv; // for debugging
|
||||
}
|
||||
}
|
||||
if (DEBUG) System.out.println("--- repSource: " + repSource
|
||||
+ ", equivSet: " + equivSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString () {
|
||||
StringBuffer result = new StringBuffer();
|
||||
// make a set to skip duplicates
|
||||
Iterator it = new HashSet(sourceToEquiv.values()).iterator();
|
||||
while (it.hasNext()) {
|
||||
toString((Set)it.next(), result, forcedMerge);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private String toString(Object s) {
|
||||
if (s == null) return "null";
|
||||
if (s instanceof Collection) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
toString((Collection)s, sb, null);
|
||||
return sb.toString();
|
||||
}
|
||||
if (hex && s instanceof Number) {
|
||||
return Utility.hex(s, places);
|
||||
}
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
private void toString(Collection s, StringBuffer sb, Map valueToRep) {
|
||||
if (sb.length() != 0) sb.append(itemSeparator);
|
||||
if (s == null) {
|
||||
sb.append("{}");
|
||||
return;
|
||||
}
|
||||
sb.append('{');
|
||||
Iterator it = s.iterator();
|
||||
boolean notFirst = false;
|
||||
while (it.hasNext()) {
|
||||
if (notFirst) sb.append(", ");
|
||||
notFirst = true;
|
||||
Object n = it.next();
|
||||
sb.append(toString(n));
|
||||
/*if (valueToRep != null) {
|
||||
sb.append("(" + toString(valueToRep.get(n)) + ")");
|
||||
}*/
|
||||
}
|
||||
sb.append('}');
|
||||
}
|
||||
|
||||
}
|
113
tools/unicodetools/com/ibm/text/utility/IndentWriter.java
Normal file
113
tools/unicodetools/com/ibm/text/utility/IndentWriter.java
Normal file
|
@ -0,0 +1,113 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
 * A {@link Writer} that collects written items into lines. Items on one line
 * are joined by a separator, each line is prefixed with an indent of spaces,
 * and a line is emitted early when adding the next item would exceed the
 * configured width.
 */
public class IndentWriter extends Writer {
    private Writer writer;
    private StringBuffer buffer = new StringBuffer(200);
    private int width;
    private int indent;
    private int bufferIndent;   // indent captured when the current line was started
    private String separator;
    private static String EOL;

    static { // gets platform-specific eol
        StringWriter foo = new StringWriter();
        PrintWriter fii = new PrintWriter(foo);
        fii.println();
        fii.flush();
        EOL = foo.toString();
    }

    /**
     * Wraps the given writer with width 30000 and a single-space separator.
     */
    public IndentWriter(Writer writer) {
        this.writer = writer;
        this.width = 30000;
        this.separator = " ";
    }

    /**
     * Wraps the given stream, encoding characters with {@code encoding}.
     *
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public IndentWriter(OutputStream writer, String encoding)
        throws UnsupportedEncodingException{
        this.writer = new OutputStreamWriter(writer, encoding);
        this.width = 30000;
        this.separator = " ";
    }

    public void setSeparator(String separator) {
        this.separator = separator;
    }

    public String getSeparator() {
        return separator;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    public int getWidth() {
        return width;
    }

    /** Changes the indent by {@code indentDelta} and flushes the pending line. */
    public void indentBy(int indentDelta) throws IOException {
        this.indent += indentDelta;
        flush();
    }

    public void setIndent(int indent) {
        this.indent = indent;
    }

    public int getIndent() {
        return indent;
    }

    /** Sets the indent, then writes {@code string} as one item. */
    public void write(int indent, String string) throws IOException {
        setIndent(indent);
        write(string,0,string.length());
    }

    /** Sets the indent, writes {@code string} as one item and ends the line. */
    public void writeln(int indent, String string) throws IOException {
        write(indent, string);
        flushLine();
    }

    /** Writes {@code string} as one item and ends the line. */
    public void writeln(String string) throws IOException {
        write(string);
        flushLine();
    }

    /** Ends the current line, if it has any content. */
    public void writeln() throws IOException {
        flushLine();
    }

    /**
     * Adds one item to the current line. The first item of a line captures
     * the current indent; later items are preceded by the separator, unless
     * the line would grow past the width, in which case the line is emitted
     * first and the item starts the next one.
     */
    public void write(char cbuf[], int off, int len) throws IOException {
        if (buffer.length() == 0) {
            bufferIndent = indent;
        } else if (bufferIndent + buffer.length() + separator.length() + len > width) {
            flushLine();
        } else {
            buffer.append(separator);
        }
        buffer.append(cbuf, off, len);
    }

    /**
     * Emits the buffered line (indent, content, line terminator) to the
     * underlying writer; does nothing when the buffer is empty.
     */
    public void flushLine() throws IOException {
        if (buffer.length() != 0) { // indent
            // Write the indent one space at a time: taking a substring of a
            // fixed literal fails as soon as the indent exceeds its length.
            for (int i = 0; i < bufferIndent; ++i) {
                writer.write(' ');
            }
            writer.write(buffer.toString());
            writer.write(EOL);
            buffer.setLength(0);
        }
    }

    /** Emits any pending line and flushes the underlying writer. */
    public void flush() throws IOException {
        flushLine();
        writer.flush();
    }

    /** Flushes pending output and closes the underlying writer. */
    public void close() throws IOException {
        flush();
        writer.close();
    }
}
|
41
tools/unicodetools/com/ibm/text/utility/IntStack.java
Normal file
41
tools/unicodetools/com/ibm/text/utility/IntStack.java
Normal file
|
@ -0,0 +1,41 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
// =============================================================
|
||||
// Simple stack mechanism, with push, pop and access
|
||||
// =============================================================
|
||||
|
||||
/**
 * Growable stack of ints with push, pop and indexed read access.
 */
public final class IntStack {
    private int[] values;
    private int top = 0;

    /**
     * @param initialSize initial capacity; may be 0, the stack grows on demand
     */
    public IntStack(int initialSize) {
        values = new int[initialSize];
    }

    /**
     * Pushes a value, doubling the backing array when it is full.
     */
    public void push(int value) {
        if (top >= values.length) { // must grow?
            // Doubling a zero-length array would stay at zero, so always
            // grow to at least one slot.
            int newSize = values.length * 2;
            if (newSize < 1) {
                newSize = 1;
            }
            int[] temp = new int[newSize];
            System.arraycopy(values,0,temp,0,values.length);
            values = temp;
        }
        values[top++] = value;
    }

    /**
     * Removes and returns the most recently pushed value.
     *
     * @throws IllegalArgumentException if the stack is empty
     */
    public int pop() {
        if (top > 0) return values[--top];
        throw new IllegalArgumentException("Stack underflow");
    }

    /**
     * Returns the value at {@code index}, counted from the bottom of the stack.
     *
     * @throws IllegalArgumentException if {@code index} is not in [0, getTop())
     */
    public int get(int index) {
        if (0 <= index && index < top) return values[index];
        throw new IllegalArgumentException("Stack index out of bounds");
    }

    /** Returns the number of values currently on the stack. */
    public int getTop() {
        return top;
    }

    public boolean isEmpty() {
        return top == 0;
    }
}
|
|
@ -0,0 +1,13 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Orders strings by length first (shorter strings sort earlier) and falls
 * back to {@link String#compareTo(String)} for strings of equal length.
 */
public final class LengthFirstComparator implements Comparator {
    public int compare(Object a, Object b) {
        final String left = (String) a;
        final String right = (String) b;
        final int leftLength = left.length();
        final int rightLength = right.length();
        if (leftLength != rightLength) {
            return (leftLength < rightLength) ? -1 : 1;
        }
        return left.compareTo(right);
    }
}
|
31
tools/unicodetools/com/ibm/text/utility/Pair.java
Normal file
31
tools/unicodetools/com/ibm/text/utility/Pair.java
Normal file
|
@ -0,0 +1,31 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/**
 * An ordered pair of comparable values. Pairs compare by their first
 * component and then by their second.
 */
public final class Pair implements java.lang.Comparable {

    public Comparable first, second;

    public Pair (Comparable first, Comparable second) {
        this.first = first;
        this.second = second;
    }

    /** Hash of both components; a null component contributes 0. */
    public int hashCode() {
        return hash(first) * 37 + hash(second);
    }

    /**
     * Two pairs are equal when both components are equal; null components
     * are equal only to null. Uses an explicit type check rather than
     * catching exceptions, so the method is reflexive even for pairs that
     * hold nulls and does not hide failures raised by a component's own
     * {@code equals}.
     */
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Pair)) {
            return false; // also covers other == null
        }
        Pair that = (Pair)other;
        return same(first, that.first) && same(second, that.second);
    }

    /**
     * Compares by first component, then by second.
     *
     * @throws ClassCastException if {@code other} is not a Pair
     */
    public int compareTo(Object other) {
        Pair that = (Pair)other;
        int trial = first.compareTo(that.first);
        if (trial != 0) return trial;
        return second.compareTo(that.second);
    }

    /** Null-safe equality of two components. */
    private static boolean same(Object x, Object y) {
        return (x == null) ? (y == null) : x.equals(y);
    }

    /** Null-safe hash of one component. */
    private static int hash(Object x) {
        return (x == null) ? 0 : x.hashCode();
    }
}
|
8
tools/unicodetools/com/ibm/text/utility/UTF16Plus.java
Normal file
8
tools/unicodetools/com/ibm/text/utility/UTF16Plus.java
Normal file
|
@ -0,0 +1,8 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
public class UTF16Plus {
|
||||
public static int charAt(StringBuffer source, int offset16) {
|
||||
return UTF32.char32At(source, offset16);
|
||||
}
|
||||
}
|
||||
|
718
tools/unicodetools/com/ibm/text/utility/UTF32.java
Normal file
718
tools/unicodetools/com/ibm/text/utility/UTF32.java
Normal file
|
@ -0,0 +1,718 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/**
|
||||
* Utility class for demonstrating UTF16 character conversions and indexing conversions.
|
||||
* Ideally, these methods would be on existing classes in Java, but they can also be used
|
||||
* in a stand-alone utility class like this one.
|
||||
* <p>Code that uses strings alone rarely needs modification.
|
||||
* By design, UTF-16 does not allow overlap, so searching for strings is a safe operation.
|
||||
* Similarly, concatenation is always safe. Substringing is safe if the start and end are both
|
||||
* on UTF32 boundaries. In normal code, the values for start and end are on those boundaries,
|
||||
* since they arose from operations like searching.
|
||||
* If not, the nearest UTF-32 boundaries can be determined using <code>bounds32()</code>.
|
||||
* <p>Here is a summary of the methods:
|
||||
* <ul><li>
|
||||
* <code>char32At()</code>, <code>count32()</code>, and <code>append32()</code>
|
||||
* are most important methods for most programs.
|
||||
* They are used for iteration, filtering and copying. See the examples below.
|
||||
* </li><li>
|
||||
* <code>bounds32()</code> is useful for finding the nearest UTF-32 boundaries.
|
||||
* However, in most circumstances it is better to use
|
||||
* <a href="http://java.sun.com/products/jdk/1.2/docs/api/java/text/BreakIterator.html#getCharacterInstance(java.util.Locale)">
|
||||
* BreakIterator.getCharacterInstance(Locale)</a> to find character boundaries
|
||||
* that are closer to end-user expectations.
|
||||
* </li><li>
|
||||
* <code>valueOf32()</code> is occasionally convenient for producing a string containing a UTF-32 value.
|
||||
* </li><li>
|
||||
* <code>findOffset16()</code> and <code>findOffset32()</code> are generally not needed,
|
||||
* except when interfacing to specifications that use UTF-32 indices (such as XSL).
|
||||
* </li><li>
|
||||
* <code>isLegal()</code> can be used to test whether UTF-16 or UTF-32 values are valid.
|
||||
* </li><li>
|
||||
* <code>isLeadSurrogate()</code>, <code>isSurrogate()</code>, and <code>isTrailSurrogate()</code>
|
||||
* test the type of a char. They are useful for lower-level code.
|
||||
* </li><li>
|
||||
* <code>getChar32()</code>, <code>getLead()</code>, and <code>getTrail()</code>
|
||||
* are sometimes useful for putting together and taking apart UTF-32 values.
|
||||
* </li></ul>
|
||||
* <strong>Examples:</strong>
|
||||
* <p>The following examples illustrate use of some of these methods.
|
||||
<pre>
|
||||
// iteration forwards: Original
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration forwards: Changes for UTF-32
|
||||
int ch;
|
||||
for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) {
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration backwards: Original
|
||||
for (int i = s.length()-1; i >= 0; --i) {
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration backwards: Changes for UTF-32
|
||||
int ch;
|
||||
for (int i = s.length()-1; i >= 0; i-=UTF32.count16(ch)) {
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
* </pre>
|
||||
* <strong>Notes:</strong>
|
||||
* <ul><li>
|
||||
* <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code> and <code>Trail</code> in the API,
|
||||
* which gives a better sense of their ordering in a string. <code>offset16</code> and <code>offset32</code> are used to distinguish
|
||||
* offsets to UTF-16 boundaries vs offsets to UTF-32 boundaries.
|
||||
* <code>int char32</code> is used to contain UTF-32 characters, as opposed to <code>char</code>, which is a UTF-16 code unit.
|
||||
* </li><li>
|
||||
* <strong>Roundtripping Offsets:</strong> You can always roundtrip
|
||||
* from a UTF-32 offset to a UTF-16 offset and back.
|
||||
* Because of the difference in structure, you can roundtrip
|
||||
* from a UTF-16 offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
|
||||
* </li><li>
|
||||
* <strong>Exceptions:</strong> The error checking will throw an exception if indices are out of bounds.
|
||||
* Other than that, all methods will behave reasonably,
|
||||
* even if unmatched surrogates or out-of-bounds UTF-32 values are present.
|
||||
* <code>isLegal()</code> can be used to check for validity if desired.
|
||||
* </li><li>
|
||||
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then these are
|
||||
* counted as one UTF-32 value. This matches their iteration behavior, which is vital.
|
||||
* It also matches common display practice as
|
||||
* missing glyphs (see the Unicode Standard Section 5.4, 5.5).
|
||||
* </li><li>
|
||||
* <strong>Out-of-bounds UTF-32 values:</strong> If a <code>char32</code> contains an out-of-bounds UTF-32 value,
|
||||
* then it is treated as REPLACEMENT_CHAR for consistency across the API.
|
||||
* </li><li>
|
||||
* <strong>Optimization:</strong> The method implementations may need optimization if the compiler doesn't fold static final methods.
|
||||
* Since surrogate pairs will form an exceedingly small percentage of all the text in the world,
|
||||
* the singleton case should always be optimized for.
|
||||
* </li></ul>
|
||||
* @author Mark Davis, with help from Markus Scherer
|
||||
*/
|
||||
public final class UTF32 {
|
||||
|
||||
// =========================================================
|
||||
// UTILITIES
|
||||
// =========================================================
|
||||
|
||||
/**
|
||||
* Unicode value used when translating into Unicode encoding form
|
||||
* and there is no existing character.
|
||||
*/
|
||||
public static final char REPLACEMENT_CHAR = '\uFFFD';
|
||||
|
||||
/**
|
||||
* Value returned in <code><a href="#bounds32(java.lang.String, int)">bounds32()</a></code>.
|
||||
*/
|
||||
public static final int SINGLE = 1, LEAD = 2, TRAIL = 5;
|
||||
|
||||
/**
|
||||
* Determines how many chars this char32 requires.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code>
|
||||
* on char32 before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of either Character or String.</i>
|
||||
* @return 2 if is in surrogate space, otherwise 1.
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static int count16(int char32) {
|
||||
if (char32 < MIN_SUPPLEMENTARY) return 1;
|
||||
return 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string.
|
||||
* Used when iterating forwards or backwards (with <code>count16()</code>, as well as random access.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code> on the return value.
|
||||
* <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in <code>bounds32()</code>.
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset16 UTF-16 offset to the start of the character.
|
||||
*/
|
||||
public static int char32At(String source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (!isSurrogate(single)) return single;
|
||||
|
||||
try { // use exception to catch out-of-bounds
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is low,
|
||||
// look both directions.
|
||||
|
||||
if (isLeadSurrogate(single)) {
|
||||
char trail = source.charAt(++offset16);
|
||||
if (isTrailSurrogate(trail)) {
|
||||
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
|
||||
}
|
||||
} else { // isTrailSurrogate(single), so
|
||||
char lead = source.charAt(--offset16);
|
||||
if (isLeadSurrogate(lead)) {
|
||||
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
|
||||
}
|
||||
}
|
||||
} catch (StringIndexOutOfBoundsException e) {}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
public static int char32At(StringBuffer source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (!isSurrogate(single)) return single;
|
||||
|
||||
try { // use exception to catch out-of-bounds
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is low,
|
||||
// look both directions.
|
||||
|
||||
if (isLeadSurrogate(single)) {
|
||||
char trail = source.charAt(++offset16);
|
||||
if (isTrailSurrogate(trail)) {
|
||||
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
|
||||
}
|
||||
} else { // isTrailSurrogate(single), so
|
||||
char lead = source.charAt(--offset16);
|
||||
if (isLeadSurrogate(lead)) {
|
||||
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
|
||||
}
|
||||
}
|
||||
} catch (StringIndexOutOfBoundsException e) {}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
public static int char32At(char[] source, int start16, int end16, int offset16) {
|
||||
if (offset16 < start16 || offset16 >= end16) {
|
||||
throw new ArrayIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
|
||||
char single = source[offset16];
|
||||
if (!isSurrogate(single)) return single;
|
||||
|
||||
try { // use exception to catch out-of-bounds
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is low,
|
||||
// look both directions.
|
||||
|
||||
if (isLeadSurrogate(single)) {
|
||||
++offset16;
|
||||
if (offset16 >= end16) return single;
|
||||
char trail = source[offset16];
|
||||
if (isTrailSurrogate(trail)) {
|
||||
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
|
||||
}
|
||||
} else { // isTrailSurrogate(single), so
|
||||
char lead = source[--offset16];
|
||||
if (isLeadSurrogate(lead)) {
|
||||
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
|
||||
}
|
||||
}
|
||||
} catch (ArrayIndexOutOfBoundsException e) {}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
|
||||
// moral equivalent of valueOf32(charAt32(x)), but no memory alloc
|
||||
public static String getCodePointSubstring(String s, int offset16) {
|
||||
switch(bounds32(s,offset16)) {
|
||||
default: return s.substring(offset16,offset16+1);
|
||||
case LEAD: return s.substring(offset16,offset16+2);
|
||||
case TRAIL: return s.substring(offset16-1,offset16+1);
|
||||
}
|
||||
}
|
||||
|
||||
// moral equivalent of valueOf32(charAt32(x)), but no memory alloc
|
||||
public static String getCodePointSubstring(StringBuffer s, int offset16) {
|
||||
switch(bounds32(s,offset16)) {
|
||||
default: return s.substring(offset16,offset16+1);
|
||||
case LEAD: return s.substring(offset16,offset16+2);
|
||||
case TRAIL: return s.substring(offset16-1,offset16+1);
|
||||
}
|
||||
}
|
||||
|
||||
public static int append32(char[] output, int oPosition, int oEnd, int cp) {
|
||||
if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
|
||||
output[oPosition++] = UTF32.getLead(cp);
|
||||
if (UTF32.count16(cp) != 1) {
|
||||
if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
|
||||
output[oPosition++] = UTF32.getTrail(cp);
|
||||
}
|
||||
return oPosition;
|
||||
}
|
||||
|
||||
public static void setChar32At(StringBuffer b, int position, int codePoint) {
|
||||
int type = bounds32(b, position);
|
||||
// handle simple cases: #chars at position match #chars in codePoint
|
||||
int end = position;
|
||||
switch (type) {
|
||||
case SINGLE:
|
||||
if (isSupplementary(codePoint)) break;
|
||||
b.setCharAt(position, (char)codePoint);
|
||||
return;
|
||||
case LEAD:
|
||||
if (!isSupplementary(codePoint)) {
|
||||
++end;
|
||||
break;
|
||||
}
|
||||
b.setCharAt(position++, (char)getLead(codePoint));
|
||||
b.setCharAt(position, (char)getTrail(codePoint));
|
||||
return;
|
||||
case TRAIL:
|
||||
if (!isSupplementary(codePoint)) {
|
||||
--position;
|
||||
break;
|
||||
}
|
||||
b.setCharAt(position++, (char)getLead(codePoint));
|
||||
b.setCharAt(position, (char)getTrail(codePoint));
|
||||
return;
|
||||
}
|
||||
// mismatch, just use long form
|
||||
b.replace(position, end+1, valueOf32(codePoint));
|
||||
}
|
||||
|
||||
/**
|
||||
* See if a char value is legal. It can't be:
|
||||
* <ul><li>Not-a-character (either \\uFFFF or\\uFFFE).
|
||||
* The datatype char itself prevents out of bounds errors.
|
||||
* </li></ul>
|
||||
* Note: legal does not mean that it is assigned in this version of Unicode.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @param char16 UTF-16 code unit to test
|
||||
* @return true iff legal.
|
||||
*/
|
||||
public static boolean isLegal(char char16) {
|
||||
return (char16 < 0xFFFE);
|
||||
}
|
||||
|
||||
/**
|
||||
* See if a UTF32 value is legal. It can't be:
|
||||
* <ul>
|
||||
* <li>Out of bounds (less than 0 or greater than MAX_UNICODE)</li>
|
||||
* <li>A surrogate value (00D800 to 00DFFF)</li>
|
||||
* <li>Not-a-character (of the form xxFFFF or xxFFFE)</li>
|
||||
* </ul>
|
||||
* Note: legal does not mean that it is assigned in this version of Unicode.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @param char32 UTF-32 value to test
|
||||
* @return true iff legal.
|
||||
*/
|
||||
public static boolean isLegal(int char32) {
|
||||
if (char32 < 0) return false;
|
||||
//if (char32 < SURROGATE_BASE) return true;
|
||||
//if (char32 < SURROGATE_LIMIT) return false;
|
||||
if ((char32 & PLANE_MASK) >= NON_CHARACTER_BASE) return false;
|
||||
return (char32 <= MAX_UNICODE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code unit OR code point is a surrogate.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return true iff the input character is a surrogate.
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static boolean isSurrogate(int char32) {
|
||||
return (SURROGATE_BASE <= char32 && char32 < SURROGATE_LIMIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code point is a supplementary.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return true iff the input code point is supplementary (MIN_SUPPLEMENTARY through MAX_UNICODE).
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static boolean isSupplementary(int char32) {
|
||||
return (char32 >= MIN_SUPPLEMENTARY && char32 <= MAX_UNICODE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code point is a basic (BMP) code point, that is, non-negative and below MIN_SUPPLEMENTARY.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return true iff the input code point is in the range 0 to 0xFFFF.
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static boolean isBasic(int char32) {
|
||||
return (char32 >= 0 && char32 < MIN_SUPPLEMENTARY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a trail surrogate.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return true iff the input character is a trail surrogate.
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static boolean isTrailSurrogate(char ch) {
|
||||
return (TRAIL_BASE <= ch && ch < TRAIL_LIMIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a lead surrogate.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return true iff the input character is a lead surrogate.
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static boolean isLeadSurrogate(char ch) {
|
||||
return (LEAD_BASE <= ch && ch < LEAD_LIMIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the lead surrogate.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code> on char32 before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return lead surrogate if the count16(ch) is 2;
|
||||
* <br>otherwise the character itself
|
||||
* @param char32 the input character.
|
||||
*/
|
||||
public static char getLead(int char32) {
|
||||
if (char32 >= MIN_SUPPLEMENTARY) {
|
||||
return (char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT));
|
||||
}
|
||||
return (char)char32;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the trail surrogate.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code> on char32 before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return the trail surrogate if the count16(ch) is 2;
|
||||
* <br>and 0 otherwise (note: 0 is not a valid trail surrogate).
|
||||
* @param char32 the input character.
|
||||
*/
|
||||
public static char getTrail(int char32) {
|
||||
if (char32 >= MIN_SUPPLEMENTARY) {
|
||||
return (char)(TRAIL_BASE + (char32 & TRAIL_MASK));
|
||||
}
|
||||
return '\u0000';
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method corresponding to String.valueOf(char). It returns a one or two char string containing
|
||||
* the UTF-32 value. If the input value can't be converted, it substitutes REPLACEMENT_CHAR.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code> before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String.</i>
|
||||
* @return string value of char32
|
||||
* @param ch the input character.
|
||||
*/
|
||||
public static String valueOf32(int char32) {
|
||||
if (char32 < 0 || MAX_UNICODE < char32) return String.valueOf(REPLACEMENT_CHAR);
|
||||
if (char32 < MIN_SUPPLEMENTARY) return String.valueOf((char)char32);
|
||||
synchronized (buf2) { // saves allocations
|
||||
buf2[0] = (char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT));
|
||||
buf2[1] = (char)(TRAIL_BASE + (char32 & TRAIL_MASK));
|
||||
return String.valueOf(buf2);
|
||||
}
|
||||
}
|
||||
private static char[] buf2 = new char[2]; // used to avoid allocations
|
||||
|
||||
/**
|
||||
* Returns the UTF-32 character corresponding to the two chars.
|
||||
* If a validity check is required, check the arguments with
|
||||
* <code>isLeadSurrogate()</code> and <code>isTrailSurrogate()</code>, respectively before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
|
||||
* @return the UTF-32 character, or REPLACEMENT_CHAR if invalid.
|
||||
* @param lead the lead char
|
||||
* @param lead the trail char
|
||||
*/
|
||||
public static int getChar32(char lead, char trail) {
|
||||
if (isLeadSurrogate(lead) && isTrailSurrogate(trail)) {
|
||||
return (lead <<= SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
|
||||
}
|
||||
return REPLACEMENT_CHAR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the type of the UTF32 boundaries around the char at offset16.
|
||||
* Used for random access.
|
||||
* <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
|
||||
* @return SINGLE, LEAD, or TRAIL:
|
||||
* <ul><li>
|
||||
* SINGLE: a single char; the bounds are [offset16, offset16+1]
|
||||
* </li><li>
|
||||
* LEAD: a surrogate pair starting at offset16; the bounds are [offset16, offset16+2]
|
||||
* </li><li>
|
||||
* TRAIL: a surrogate pair starting at offset16-1; the bounds are [offset16-1, offset16+1]
|
||||
* </ul>
|
||||
* For bit-twiddlers, the return values for these are chosen so that the boundaries can be gotten by:
|
||||
* [offset16 - (value>>2), offset16 + (value&3)].
|
||||
* @param source text to analyse
|
||||
* @param offset16 UTF-16 offset
|
||||
* @exception StringIndexOutOfBoundsException if offset16 is out of bounds.
|
||||
*/
|
||||
public static int bounds32(String source, int offset16) {
|
||||
char ch = source.charAt(offset16);
|
||||
if (isSurrogate(ch)) {
|
||||
if (isLeadSurrogate(ch)) {
|
||||
if (++offset16 < source.length()
|
||||
&& isTrailSurrogate(source.charAt(offset16))) return LEAD;
|
||||
} else { // isTrailSurrogate(ch), so
|
||||
if (--offset16 >= 0
|
||||
&& isLeadSurrogate(source.charAt(offset16))) return TRAIL;
|
||||
}
|
||||
}
|
||||
return SINGLE;
|
||||
}
|
||||
|
||||
public static int bounds32(StringBuffer source, int offset16) {
|
||||
char ch = source.charAt(offset16);
|
||||
if (isSurrogate(ch)) {
|
||||
if (isLeadSurrogate(ch)) {
|
||||
if (++offset16 < source.length()
|
||||
&& isTrailSurrogate(source.charAt(offset16))) return LEAD;
|
||||
} else { // isTrailSurrogate(ch), so
|
||||
if (--offset16 >= 0
|
||||
&& isLeadSurrogate(source.charAt(offset16))) return TRAIL;
|
||||
}
|
||||
}
|
||||
return SINGLE;
|
||||
}
|
||||
|
||||
// should be renamed bounds
|
||||
|
||||
public static int bounds32(char[] source, int oStart, int oEnd, int offset16) {
|
||||
if (offset16 < oStart || offset16 >= oEnd) {
|
||||
throw new ArrayIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
char ch = source[offset16];
|
||||
if (isSurrogate(ch)) {
|
||||
if (isLeadSurrogate(ch)) {
|
||||
if (++offset16 < oEnd
|
||||
&& isTrailSurrogate(source[offset16])) return LEAD;
|
||||
} else { // isTrailSurrogate(ch), so
|
||||
if (--offset16 >= oStart
|
||||
&& isLeadSurrogate(source[offset16])) return TRAIL;
|
||||
}
|
||||
}
|
||||
return SINGLE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the UTF-16 offset that corresponds to a UTF-32 offset.
|
||||
* Used for random access. See the <a name="_top_">class description</a>
|
||||
* for notes on roundtripping.
|
||||
* <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
|
||||
* @return UTF-16 offset
|
||||
* @param offset32 UTF-32 offset
|
||||
* @param source text to analyse
|
||||
* @exception StringIndexOutOfBoundsException if offset32 is out of bounds.
|
||||
*/
|
||||
public static int findOffset16(String source, int offset32) {
|
||||
int remaining = offset32; // for decrementing
|
||||
boolean hadLeadSurrogate = false;
|
||||
int i;
|
||||
|
||||
for (i = 0; remaining > 0 && i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
if (hadLeadSurrogate && isTrailSurrogate(ch)) {
|
||||
hadLeadSurrogate = false; // count valid trail as zero
|
||||
} else {
|
||||
hadLeadSurrogate = isLeadSurrogate(ch);
|
||||
--remaining; // count others as 1
|
||||
}
|
||||
}
|
||||
|
||||
// if we didn't use up all of remaining (or if we started < 0)
|
||||
// then it is beyond the bounds
|
||||
|
||||
if (remaining != 0) throw new StringIndexOutOfBoundsException(offset32);
|
||||
|
||||
// special check for last surrogate if needed, for consistency with
|
||||
// other situations
|
||||
|
||||
if (hadLeadSurrogate && i < source.length() && isTrailSurrogate(source.charAt(i))) {
|
||||
++i; // grab extra unicode
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given UTF-16 offset.
|
||||
* Used for random access. See the <a name="_top_">class description</a>
|
||||
* for notes on roundtripping.
|
||||
* <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then
|
||||
* the UTF-32 offset of the <strong>end</strong> of the pair is returned.</i>
|
||||
* <p>To find the UTF-32 length of a string, use:
|
||||
* <pre>
|
||||
* len32 = findOffset32(source, source.length());
|
||||
* </pre>
|
||||
* <p><i>If this were integrated into the Java API, it could be a methods of String, StringBuffer and possibly CharacterIterator.</i>
|
||||
* @return UTF-32 offset
|
||||
* @param source text to analyse
|
||||
* @param offset16 UTF-16 offset
|
||||
* @exception StringIndexOutOfBoundsException if offset16 is out of bounds.
|
||||
*/
|
||||
public static int findOffset32(String source, int offset16) {
|
||||
int result = 0;
|
||||
boolean hadLeadSurrogate = false;
|
||||
for (int i = 0; i < offset16; ++i) {
|
||||
char ch = source.charAt(i);
|
||||
if (hadLeadSurrogate && isTrailSurrogate(ch)) {
|
||||
hadLeadSurrogate = false; // count valid trail as zero
|
||||
} else {
|
||||
hadLeadSurrogate = isLeadSurrogate(ch);
|
||||
++result; // count others as 1
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static int length32(String source) {
|
||||
return findOffset32(source, source.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a single UTF-32 value to the end of a StringBuffer.
|
||||
* If a validity check is required, use <code><a href="#isLegal(char)">isLegal()</a></code> on char32 before calling.
|
||||
* <p><i>If this were integrated into the Java API, it could be a method of StringBuffer.</i>
|
||||
* @param char32 value to append. If out of bounds, substitutes REPLACEMENT_CHAR.
|
||||
* @param target string to add to
|
||||
*/
|
||||
public static void append32(StringBuffer target, int char32) {
|
||||
|
||||
// Check for irregular values
|
||||
|
||||
if (char32 < 0 || char32 > MAX_UNICODE) char32 = REPLACEMENT_CHAR;
|
||||
|
||||
// Write the UTF-16 values
|
||||
|
||||
if (char32 >= MIN_SUPPLEMENTARY) {
|
||||
target.append((char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT)));
|
||||
target.append((char)(TRAIL_BASE + (char32 & TRAIL_MASK)));
|
||||
} else {
|
||||
target.append((char)char32);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare strings using Unicode code point order, instead of UTF-16 code unit order.
|
||||
*/
|
||||
public static final class StringComparator implements java.util.Comparator {
|
||||
/**
|
||||
* Standard String compare. Only one small section is different, marked in the code.
|
||||
*/
|
||||
public int compare(Object a, Object b) {
|
||||
if (a == b) {
|
||||
return 0;
|
||||
}
|
||||
if (a == null) {
|
||||
return -1;
|
||||
} else if (b == null) {
|
||||
return 1;
|
||||
}
|
||||
String sa = (String) a;
|
||||
String sb = (String) b;
|
||||
int lena = sa.length();
|
||||
int lenb = sb.length();
|
||||
int len = lena;
|
||||
if (len > lenb) len = lenb;
|
||||
for (int i = 0; i < len; ++i) {
|
||||
char ca = sa.charAt(i);
|
||||
char cb = sb.charAt(i);
|
||||
if (ca == cb) continue; // skip remap if equal
|
||||
|
||||
// start of only different section
|
||||
if (ca >= 0xD800) { // reshuffle to get right codepoint order
|
||||
ca += (ca < 0xE000) ? 0x2000 : -0x800;
|
||||
}
|
||||
if (cb >= 0xD800) { // reshuffle to get right codepoint order
|
||||
cb += (cb < 0xE000) ? 0x2000 : -0x800;
|
||||
}
|
||||
// end of only different section
|
||||
|
||||
if (ca < cb) return -1;
|
||||
return 1; // wasn't equal, so return 1
|
||||
}
|
||||
if (lena < lenb) return -1;
|
||||
if (lena > lenb) return 1;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// ===========================================================
|
||||
// PRIVATES
|
||||
// ===========================================================
|
||||
|
||||
/**
|
||||
* Prevent instance from being created.
|
||||
*/
|
||||
private UTF32() {}
|
||||
|
||||
/**
|
||||
* Maximum code point values for UTF-32.
|
||||
*/
|
||||
private static final int MAX_UNICODE = 0x10FFFF;
|
||||
|
||||
/**
|
||||
* Maximum values for Basic code points (BMP).
|
||||
*/
|
||||
private static final int MAX_BASIC = 0xFFFF;
|
||||
|
||||
/**
|
||||
* Minimum value for Supplementary code points (SMP).
|
||||
*/
|
||||
private static final int MIN_SUPPLEMENTARY = 0x10000;
|
||||
|
||||
/**
|
||||
* Used to mask off single plane in checking for NON_CHARACTER
|
||||
*/
|
||||
private static final int PLANE_MASK = 0xFFFF;
|
||||
|
||||
/**
|
||||
* Range of non-characters in each plane
|
||||
*/
|
||||
private static final int
|
||||
NON_CHARACTER_BASE = 0xFFFE,
|
||||
NON_CHARACTER_END = 0xFFFF;
|
||||
|
||||
// useful statics and tables for fast lookup
|
||||
|
||||
/**
|
||||
* Values for surrogate detection. X is a surrogate iff X & SURROGATE_MASK == SURROGATE_MASK.
|
||||
*/
|
||||
static final int SURROGATE_MASK = 0xD800;
|
||||
|
||||
/**
|
||||
* Bottom 10 bits for use in surrogates.
|
||||
*/
|
||||
private static final int TRAIL_MASK = 0x3FF;
|
||||
|
||||
/**
|
||||
* Shift value for surrogates.
|
||||
*/
|
||||
private static final int SURROGATE_SHIFT = 10;
|
||||
|
||||
/**
|
||||
* Lead surrogates go from LEAD_BASE up to LEAD_LIMIT-1.
|
||||
*/
|
||||
private static final int LEAD_BASE = 0xD800, LEAD_LIMIT = 0xDC00;
|
||||
|
||||
/**
|
||||
* Trail surrogates go from TRAIL_BASE up to TRAIL_LIMIT-1.
|
||||
*/
|
||||
private static final int TRAIL_BASE = 0xDC00, TRAIL_LIMIT = 0xE000;
|
||||
|
||||
/**
|
||||
* Surrogates go from SURROGATE_BASE up to SURROGATE_LIMIT-1.
|
||||
*/
|
||||
private static final int SURROGATE_BASE = 0xD800, SURROGATE_LIMIT = 0xE000;
|
||||
|
||||
/**
|
||||
* Any codepoint at or greater than SURROGATE_SPACE_BASE requires 2 16-bit code units.
|
||||
*/
|
||||
//private static final int SURROGATE_SPACE_BASE = 0x10000;
|
||||
|
||||
/**
|
||||
* Offset to add to combined surrogate pair to avoid masking.
|
||||
*/
|
||||
private static final int SURROGATE_OFFSET = MIN_SUPPLEMENTARY
|
||||
- (LEAD_BASE << SURROGATE_SHIFT) - TRAIL_BASE;
|
||||
|
||||
private static final int LEAD_BASE_OFFSET = LEAD_BASE - (MIN_SUPPLEMENTARY >> SURROGATE_SHIFT);
|
||||
|
||||
};
|
177
tools/unicodetools/com/ibm/text/utility/UTF8StreamReader.java
Normal file
177
tools/unicodetools/com/ibm/text/utility/UTF8StreamReader.java
Normal file
|
@ -0,0 +1,177 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Utility class that reads UTF-8: it decodes the bytes of an InputStream into
 * UTF-16 chars, rejecting non-shortest forms, code points above 0x10FFFF and
 * (optionally) surrogate pairs that were encoded as two 3-byte sequences.
 * Malformed input is reported with an IllegalArgumentException.
 * <br>
 * Example of Usage:
 * <pre>
 * BufferedReader in = new BufferedReader(
 *  new UTF8StreamReader(new FileInputStream(fileName), 32*1024));
 * </pre>
 * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
 */
 // TODO: Fix case of surrogate pair crossing input buffer boundary

public final class UTF8StreamReader extends Reader {

    private InputStream input;
    private boolean checkIrregular = true;

    /**
     * @param stream source of UTF-8 bytes
     * @param buffersize size of the internal byte buffer, at least 1
     */
    UTF8StreamReader(InputStream stream, int buffersize) {
        if (buffersize < 1) {
            throw new IllegalArgumentException("UTF8StreamReader buffersize must be >= 1");
        }
        input = stream;
        bBuffer = new byte[buffersize];
    }

    private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);

    private byte[] bBuffer; // do a bit of buffering ourselves for efficiency
    private int
        bIndex = 0,             // next unread byte in bBuffer
        bEnd = 0,               // number of valid bytes in bBuffer
        bRemaining = 0,         // trail bytes still expected for the current sequence
        currentPoint = 0,       // code point being assembled
        lastPoint,              // previously completed code point
        shortestFormTest = 0;   // smallest value legal for the current sequence length
    private char cCarry = 0;    // low surrogate that did not fit into the previous read()

    // Number of trail bytes that follow each possible first byte; -1 marks an illegal first byte.
    private static final byte[] BYTES_REMAINING = {
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 0-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 1-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 2-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 3-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 4-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 5-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 6-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 7-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 8-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 9-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // A-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // B-
        -1,-1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1, // C-
         1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1, // D-
         2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2, // E-
         3, 3, 3, 3, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1  // F-
    };

    /**
     * Decodes up to len chars into cbuf, starting at index off.
     * @return the number of chars stored, 0 if len is not positive,
     * or -1 if the stream ended before any char was stored
     * @exception IndexOutOfBoundsException if off is negative or off+len exceeds cbuf.length
     * @exception IllegalArgumentException if the input is not well-formed UTF-8
     */
    public int read(char cbuf[], int off, int len) throws IOException {

        // check input arguments
        // (off may legitimately be larger than len; only the target range matters)

        if (len <= 0) return 0;
        if (off < 0 || len > cbuf.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + ", len=" + len
                + ", cbuf.length=" + cbuf.length);
        }

        int cIndex = off;
        int cEnd = off + len;

        // if we had a low surrogate from the last call, get it first

        if (cCarry != 0) {
            cbuf[cIndex++] = cCarry;
            cCarry = 0;
        }

        // now loop, filling in the output

        while (cIndex < cEnd) {

            // get more bytes if we run out

            if (bIndex >= bEnd) {
                bIndex = 0;
                bEnd = input.read(bBuffer, 0, bBuffer.length);
                if (bEnd < 0) {
                    if (cIndex == off) return -1;
                    return cIndex - off;
                }
            }

            // process the current byte (mask because Java doesn't have unsigned byte)

            int b = bBuffer[bIndex++] & 0xFF;

            switch (bRemaining) {
            // First Byte case
            case 0:
                bRemaining = BYTES_REMAINING[b];
                switch (bRemaining) {
                case 0:
                    cbuf[cIndex++] = (char) (lastPoint = b);
                    break;
                case 1:
                    currentPoint = b & 0x1F;
                    shortestFormTest = 0x80;
                    break;
                case 2:
                    currentPoint = b & 0xF;
                    shortestFormTest = 0x800;
                    break;
                case 3:
                    currentPoint = b & 0x7;
                    shortestFormTest = 0x10000;
                    break;
                default:
                    // leave the decoder expecting a first byte; otherwise the
                    // -1 state would make every later byte be skipped silently
                    bRemaining = 0;
                    throw new IllegalArgumentException("illegal lead code unit: " + b);
                }
                break;

            // Trailing bytes
            case 2: case 3:
                b ^= 0x80;
                if (b > 0x3F) {
                    throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80));
                }
                currentPoint = (currentPoint << 6) | b;
                --bRemaining;
                break;

            // Last trailing byte, time to assemble
            case 1:
                b ^= 0x80;
                if (b > 0x3F) {
                    throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80));
                }
                currentPoint = (currentPoint << 6) | b;
                --bRemaining;

                // we have gotten the code, so check and stash it

                if (currentPoint < shortestFormTest) {
                    throw new IllegalArgumentException("illegal sequence, not shortest form: " + currentPoint);
                }
                // lead surrogates are D800..DBFF; DC00 is already a trail
                if (checkIrregular && 0xD800 <= lastPoint && lastPoint <= 0xDBFF
                        && 0xDC00 <= currentPoint && currentPoint <= 0xDFFF) {
                    throw new IllegalArgumentException("irregular sequence, surrogate pair: " + currentPoint);
                }
                lastPoint = currentPoint;
                if (currentPoint >= 0x10000) {
                    if (currentPoint > 0x10FFFF) {
                        throw new IllegalArgumentException("illegal code point, too large: " + currentPoint);
                    }
                    currentPoint -= 0x10000;
                    cbuf[cIndex++] = (char)(0xD800 + (currentPoint >> 10));
                    currentPoint = 0xDC00 + (currentPoint & 0x3FF);
                    if (cIndex >= cEnd) {
                        // no room for the low surrogate: hand it out on the next call
                        cCarry = (char)currentPoint;
                        return cIndex - off;
                    }
                }
                cbuf[cIndex++] = (char)currentPoint;
                currentPoint = 0;
                break;
            }
        }
        return cIndex - off;
    }

    /** Closes the underlying input stream. */
    public void close() throws IOException {
        input.close();
    }
}
|
147
tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java
Normal file
147
tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java
Normal file
|
@ -0,0 +1,147 @@
|
|||
package com.ibm.text.utility;
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Utility class that writes UTF8.<br>
|
||||
* Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
|
||||
* <br>
|
||||
* Example of Usage:
|
||||
* <pre>
|
||||
* PrintWriter log = new PrintWriter(
|
||||
* new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
|
||||
* </pre>
|
||||
* NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
|
||||
*/
|
||||
// TODO: Fix case of surrogate pair crossing input buffer boundary
|
||||
|
||||
public final class UTF8StreamWriter extends Writer {
|
||||
|
||||
private OutputStream output;
|
||||
private byte[] bBuffer; // do a bit of buffering ourselves for efficiency
|
||||
private int bSafeEnd;
|
||||
private int bEnd;
|
||||
private int bIndex = 0;
|
||||
private int highSurrogate = 0;
|
||||
|
||||
public UTF8StreamWriter(OutputStream stream, int buffersize) {
|
||||
if (buffersize < 5) {
|
||||
throw new IllegalArgumentException("UTF8StreamWriter buffersize must be >= 5");
|
||||
}
|
||||
output = stream;
|
||||
bBuffer = new byte[buffersize];
|
||||
bEnd = buffersize;
|
||||
bSafeEnd = buffersize - 4;
|
||||
}
|
||||
|
||||
private static final int
|
||||
NEED_2_BYTES = 1<<7,
|
||||
NEED_3_BYTES = 1<<(2*5 + 1),
|
||||
NEED_4_BYTES = 1<<(3*5 + 1);
|
||||
|
||||
private static final int
|
||||
TRAILING_BOTTOM_MASK = 0x3F,
|
||||
TRAILING_TOP = 0x80;
|
||||
|
||||
private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);
|
||||
|
||||
public final void write(char[] buffer, int cStart, int cLength) throws IOException {
|
||||
int cEnd = cStart + cLength;
|
||||
while (cStart < cEnd) {
|
||||
|
||||
// write if we need to
|
||||
|
||||
if (bIndex > bSafeEnd) {
|
||||
output.write(bBuffer, 0, bIndex);
|
||||
bIndex = 0;
|
||||
}
|
||||
|
||||
// get code point
|
||||
|
||||
int utf32 = buffer[cStart++];
|
||||
|
||||
// special check for surrogates
|
||||
|
||||
if (highSurrogate != 0) {
|
||||
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
|
||||
writeCodePoint((highSurrogate << 10) + utf32 + MAGIC);
|
||||
highSurrogate = 0;
|
||||
continue;
|
||||
}
|
||||
writeCodePoint(highSurrogate);
|
||||
highSurrogate = 0;
|
||||
}
|
||||
|
||||
if (0xD800 <= utf32 && utf32 <= 0xDBFF) {
|
||||
highSurrogate = utf32;
|
||||
continue;
|
||||
}
|
||||
|
||||
// normal case
|
||||
|
||||
writeCodePoint(utf32);
|
||||
}
|
||||
}
|
||||
|
||||
private final void writeCodePoint(int utf32) {
|
||||
|
||||
// convert to bytes
|
||||
|
||||
if (utf32 < NEED_2_BYTES) {
|
||||
bBuffer[bIndex++] = (byte)utf32;
|
||||
return;
|
||||
}
|
||||
|
||||
// Find out how many bytes we need to write
|
||||
// At this point, it is at least 2.
|
||||
|
||||
//int count;
|
||||
int backIndex;
|
||||
int firstByteMark;
|
||||
if (utf32 < NEED_3_BYTES) {
|
||||
backIndex = bIndex += 2;
|
||||
firstByteMark = 0xC0;
|
||||
} else if (utf32 < NEED_4_BYTES) {
|
||||
backIndex = bIndex += 3;
|
||||
firstByteMark = 0xE0;
|
||||
bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
|
||||
utf32 >>= 6;
|
||||
} else {
|
||||
backIndex = bIndex += 4;
|
||||
firstByteMark = 0xF0;
|
||||
bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
|
||||
utf32 >>= 6;
|
||||
bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
|
||||
utf32 >>= 6;
|
||||
};
|
||||
bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
|
||||
utf32 >>= 6;
|
||||
bBuffer[--backIndex] = (byte)(firstByteMark | utf32);
|
||||
}
|
||||
|
||||
private void internalFlush() throws IOException {
|
||||
if (highSurrogate != 0) {
|
||||
if (bIndex > bEnd) {
|
||||
output.write(bBuffer, 0, bIndex);
|
||||
bIndex = 0;
|
||||
}
|
||||
writeCodePoint(highSurrogate);
|
||||
highSurrogate = 0;
|
||||
}
|
||||
|
||||
// write buffer if we need to
|
||||
if (bIndex != 0) {
|
||||
output.write(bBuffer, 0, bIndex);
|
||||
bIndex = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
internalFlush();
|
||||
output.close();
|
||||
}
|
||||
|
||||
/**
 * Writes any buffered data (including a pending high surrogate, see
 * internalFlush()) to {@code output}, then flushes {@code output}.
 *
 * @throws IOException if writing to or flushing {@code output} fails
 */
public void flush() throws IOException {
|
||||
internalFlush();
|
||||
output.flush();
|
||||
}
|
||||
}
|
443
tools/unicodetools/com/ibm/text/utility/Utility.java
Normal file
443
tools/unicodetools/com/ibm/text/utility/Utility.java
Normal file
|
@ -0,0 +1,443 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.*;
|
||||
import java.io.*;
|
||||
|
||||
public final class Utility { // COMMON UTILITIES
|
||||
|
||||
static final boolean UTF8 = true; // TODO -- make argument
|
||||
|
||||
public static String getName(int i, String[] names) {
|
||||
try {
|
||||
return names[i];
|
||||
} catch (Exception e) {
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean needCRLF = false;
|
||||
|
||||
public static void dot(int i) {
|
||||
if ((i % 0x7FF) == 0) {
|
||||
needCRLF = true;
|
||||
System.out.print('.');
|
||||
}
|
||||
}
|
||||
|
||||
public static void fixDot() {
|
||||
if (needCRLF) {
|
||||
System.out.println();
|
||||
needCRLF = false;
|
||||
}
|
||||
}
|
||||
|
||||
public static int setBits(int source, int start, int end) {
|
||||
if (start < end) {
|
||||
int temp = start;
|
||||
start = end;
|
||||
end = temp;
|
||||
}
|
||||
int bmstart = (1 << (start+1)) - 1;
|
||||
int bmend = (1 << end) - 1;
|
||||
bmstart &= ~bmend;
|
||||
return source |= bmstart;
|
||||
}
|
||||
|
||||
/**
 * Returns {@code source} with the single bit at position {@code start}
 * set, by calling setBits(source, start, start).
 */
public static int setBit(int source, int start) {
|
||||
return setBits(source, start, start);
|
||||
}
|
||||
|
||||
public static int clearBits(int source, int start, int end) {
|
||||
if (start < end) {
|
||||
int temp = start;
|
||||
start = end;
|
||||
end = temp;
|
||||
}
|
||||
int bmstart = (1 << (start+1)) - 1;
|
||||
int bmend = (1 << end) - 1;
|
||||
bmstart &= ~bmend;
|
||||
return source &= ~bmstart;
|
||||
}
|
||||
|
||||
/**
 * Returns {@code source} with the single bit at position {@code start}
 * cleared, by calling clearBits(source, start, start).
 */
public static int clearBit(int source, int start) {
|
||||
return clearBits(source, start, start);
|
||||
}
|
||||
|
||||
public static int find(String source, String[] target) {
|
||||
for (int i = 0; i < target.length; ++i) {
|
||||
if (source.equalsIgnoreCase(target[i])) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static byte lookup(String source, String[] target) {
|
||||
int result = Utility.find(source, target);
|
||||
if (result != -1) return (byte)result;
|
||||
throw new ChainException("Could not find \"{0}\" in table [{1}]", new Object [] {source, target});
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of an integer (without 0x)
|
||||
*/
|
||||
static public String hex(long i, int places) {
|
||||
if (i == Long.MIN_VALUE) return "-8000000000000000";
|
||||
boolean negative = i < 0;
|
||||
if (negative) {
|
||||
i = -i;
|
||||
}
|
||||
String result = Long.toString(i, 16).toUpperCase();
|
||||
if (result.length() < places) {
|
||||
result = "0000000000000000".substring(result.length(),places) + result;
|
||||
}
|
||||
if (negative) {
|
||||
return '-' + result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Formats {@code ch} as hex padded to at least 4 digits; shorthand for hex(ch, 4). */
public static String hex(long ch) {
|
||||
return hex(ch,4);
|
||||
}
|
||||
|
||||
/** Shorthand for hex(s, 4, " "): 4-digit padding and a single space as separator. */
public static String hex(Object s) {
|
||||
return hex(s, 4, " ");
|
||||
}
|
||||
|
||||
/** Shorthand for hex(s, places, " "): caller-chosen padding, single space as separator. */
public static String hex(Object s, int places) {
|
||||
return hex(s, places, " ");
|
||||
}
|
||||
|
||||
/** Shorthand for hex(s, 4, separator): 4-digit padding with a caller-chosen separator. */
public static String hex(Object s, String separator) {
|
||||
return hex(s, 4, separator);
|
||||
}
|
||||
|
||||
public static String hex(Object o, int places, String separator) {
|
||||
if (o == null) return "";
|
||||
if (o instanceof Number) return hex(((Number)o).longValue(), places);
|
||||
|
||||
String s = o.toString();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int ch;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(ch)) {
|
||||
if (i != 0) result.append(separator);
|
||||
ch = UTF32.char32At(s, i);
|
||||
result.append(hex(ch));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String hex(byte[] o, int start, int end) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
//int ch;
|
||||
for (int i = start; i < end; ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(hex(o[i] & 0xFF, 2));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String hex(char[] o, int start, int end) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = start; i < end; ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(hex(o[i], 4));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String repeat(String s, int count) {
|
||||
if (count <= 0) return "";
|
||||
if (count == 1) return s;
|
||||
StringBuffer result = new StringBuffer(count*s.length());
|
||||
for (int i = 0; i < count; ++i) {
|
||||
result.append(s);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static int intFrom(String p) {
|
||||
if (p.length() == 0) return Short.MIN_VALUE;
|
||||
return Integer.parseInt(p);
|
||||
}
|
||||
|
||||
public static float floatFrom(String p) {
|
||||
if (p.length() == 0) return Float.NaN;
|
||||
int fract = p.indexOf('/');
|
||||
if (fract == -1) return Float.valueOf(p).floatValue();
|
||||
String q = p.substring(0,fract);
|
||||
float num = 0;
|
||||
if (q.length() != 0) num = Integer.parseInt(q);
|
||||
p = p.substring(fract+1,p.length());
|
||||
float den = 0;
|
||||
if (p.length() != 0) den = Integer.parseInt(p);
|
||||
return num/den;
|
||||
}
|
||||
|
||||
public static int codePointFromHex(String p) {
|
||||
String temp = Utility.fromHex(p);
|
||||
if (UTF32.length32(temp) != 1) throw new ChainException("String is not single (UTF32) character: " + p, null);
|
||||
return UTF32.char32At(temp, 0);
|
||||
}
|
||||
|
||||
public static String fromHex(String p) {
|
||||
StringBuffer output = new StringBuffer();
|
||||
int value = 0;
|
||||
int count = 0;
|
||||
main:
|
||||
for (int i = 0; i < p.length(); ++i) {
|
||||
char ch = p.charAt(i);
|
||||
int digit = 0;
|
||||
switch (ch) {
|
||||
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
|
||||
digit = ch - 'a' + 10;
|
||||
break;
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
|
||||
digit = ch - 'A' + 10;
|
||||
break;
|
||||
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
|
||||
case '8': case '9':
|
||||
digit = ch - '0';
|
||||
break;
|
||||
default:
|
||||
int type = Character.getType(ch);
|
||||
if (type != Character.SPACE_SEPARATOR) {
|
||||
throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
|
||||
new Object[] {String.valueOf(ch), new Integer(i), p});
|
||||
}
|
||||
// fall through!!
|
||||
case ' ': case ',': case ';': // do SPACE here, just for speed
|
||||
if (count != 0) {
|
||||
UTF32.append32(output, value);
|
||||
}
|
||||
count = 0;
|
||||
value = 0;
|
||||
continue main;
|
||||
}
|
||||
value <<= 4;
|
||||
value += digit;
|
||||
if (value > 0x10FFFF) {
|
||||
throw new ChainException("Character code too large: '{0}' at position {1} in \"{2}\"",
|
||||
new Object[] {String.valueOf(ch), new Integer(i), p});
|
||||
}
|
||||
count++;
|
||||
}
|
||||
if (count != 0) {
|
||||
UTF32.append32(output, value);
|
||||
}
|
||||
return output.toString();
|
||||
}
|
||||
|
||||
public static int split(String s, char divider, String[] output) {
|
||||
int last = 0;
|
||||
int current = 0;
|
||||
int i;
|
||||
for (i = 0; i < s.length(); ++i) {
|
||||
if (s.charAt(i) == divider) {
|
||||
output[current++] = s.substring(last,i);
|
||||
last = i+1;
|
||||
}
|
||||
}
|
||||
output[current++] = s.substring(last,i);
|
||||
int result = current;
|
||||
while (current < output.length) {
|
||||
output[current++] = "";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static String[] split(String s, char divider) {
|
||||
String[] result = new String[100];
|
||||
int count = split(s, divider, result);
|
||||
return extract(result, 0, count);
|
||||
}
|
||||
|
||||
public static String[] extract(String[] source, int start, int end) {
|
||||
String[] result = new String[end-start];
|
||||
System.arraycopy(source, start, result, 0, end - start);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
public static String quoteJava(String s) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
result.append(quoteJava(s.charAt(i)));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
*/
|
||||
public static String quoteJavaString(String s) {
|
||||
if (s == null) return "null";
|
||||
StringBuffer result = new StringBuffer();
|
||||
result.append('"');
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
result.append(quoteJava(s.charAt(i)));
|
||||
}
|
||||
result.append('"');
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String quoteJava(int c) {
|
||||
switch (c) {
|
||||
case '\\':
|
||||
return "\\\\";
|
||||
case '"':
|
||||
return "\\\"";
|
||||
case '\r':
|
||||
return "\\r";
|
||||
case '\n':
|
||||
return "\\n";
|
||||
default:
|
||||
if (c >= 0x20 && c <= 0x7E) {
|
||||
return String.valueOf((char)c);
|
||||
} else if (UTF32.isSupplementary(c)) {
|
||||
return "\\u" + hex((char)UTF32.getLead(c),4) + "\\u" + hex((char)UTF32.getTrail(c),4);
|
||||
} else {
|
||||
return "\\u" + hex((char)c,4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String quoteXML(int c) {
|
||||
switch (c) {
|
||||
case '<': return "<";
|
||||
case '>': return ">";
|
||||
case '&': return "&";
|
||||
case '\'': return "'";
|
||||
case '"': return """;
|
||||
|
||||
// fix controls, since XML can't handle
|
||||
|
||||
// also do this for 09, 0A, and 0D, so we can see them.
|
||||
case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
|
||||
case 0x08: case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D: case 0x0E: case 0x0F:
|
||||
case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
|
||||
case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
|
||||
case 0x7F:
|
||||
|
||||
// fix noncharacters, since XML can't handle
|
||||
case 0xFFFE: case 0xFFFF:
|
||||
|
||||
return "#x" + hex(c,1) + ";";
|
||||
}
|
||||
|
||||
// fix surrogates, since XML can't handle
|
||||
if (UTF32.isSurrogate(c)) {
|
||||
return "#x" + hex(c,1) + ";";
|
||||
}
|
||||
|
||||
if (c <= 0x7E || UTF8) {
|
||||
return UTF32.valueOf32(c);
|
||||
}
|
||||
|
||||
// fix supplementaries & high characters, because of IE bug
|
||||
/*if (UTF32.isSupplementary(c) || 0xFFF9 <= c && c <= 0xFFFD) {
|
||||
return "#x" + hex(c,1) + ";";
|
||||
}
|
||||
*/
|
||||
|
||||
return "&#x" + hex(c,1) + ";";
|
||||
}
|
||||
|
||||
public static String quoteXML(String source) {
|
||||
if (source == null) return "null";
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
int c = UTF32.char32At(source, i);
|
||||
if (UTF32.isSupplementary(c)) ++i;
|
||||
result.append(quoteXML(c));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
|
||||
while (aStart < aEnd && bStart < bEnd) {
|
||||
int diff = a[aStart++] - b[bStart++];
|
||||
if (diff != 0) return diff;
|
||||
}
|
||||
return (aEnd - aStart) - (bEnd - bStart);
|
||||
}
|
||||
|
||||
public static int compare(byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) {
|
||||
while (aStart < aEnd && bStart < bEnd) {
|
||||
int diff = a[aStart++] - b[bStart++];
|
||||
if (diff != 0) return diff;
|
||||
}
|
||||
return (aEnd - aStart) - (bEnd - bStart);
|
||||
}
|
||||
|
||||
public static int compareUnsigned(byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) {
|
||||
while (aStart < aEnd && bStart < bEnd) {
|
||||
int diff = (a[aStart++] & 0xFF) - (b[bStart++] & 0xFF);
|
||||
if (diff != 0) return diff;
|
||||
}
|
||||
return (aEnd - aStart) - (bEnd - bStart);
|
||||
}
|
||||
|
||||
public static String join(int[] array, String sep) {
|
||||
String result = "{";
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
if (i != 0) result += sep;
|
||||
result += array[i];
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
public static String join(long[] array, String sep) {
|
||||
String result = "{";
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
if (i != 0) result += sep;
|
||||
result += array[i];
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
private static final String[] searchPath = {
|
||||
"EXTRAS",
|
||||
"3.1.1",
|
||||
"3.1.0",
|
||||
"3.0.1",
|
||||
"3.0.0",
|
||||
"2.1.9",
|
||||
"2.0.0",
|
||||
"1.1.0",
|
||||
};
|
||||
|
||||
private static final String DATA_DIR = "C:\\DATA";
|
||||
|
||||
public static PrintWriter openPrintWriter(String filename) throws IOException {
|
||||
return new PrintWriter(
|
||||
new UTF8StreamWriter(new FileOutputStream(DATA_DIR + File.separator + "GEN" + File.separator + filename),
|
||||
32*1024));
|
||||
}
|
||||
|
||||
public static BufferedReader openUnicodeFile(String filename, String version) throws IOException {
|
||||
// get all the files in the directory
|
||||
|
||||
for (int i = 0; i < searchPath.length; ++i) {
|
||||
if (version.length() != 0 && version.compareTo(searchPath[i]) < 0) continue;
|
||||
|
||||
String directoryName = DATA_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
|
||||
System.out.println("Trying: '" + directoryName + "'");
|
||||
File directory = new File(directoryName);
|
||||
String[] list = directory.list();
|
||||
for (int j = 0; j < list.length; ++j) {
|
||||
String fn = list[j];
|
||||
if (!fn.endsWith(".txt")) continue;
|
||||
//System.out.print("\t'" + fn + "'");
|
||||
if (!fn.startsWith(filename)) {
|
||||
//System.out.println(" -- MISS: '" + filename + "'");
|
||||
continue;
|
||||
}
|
||||
//System.out.println(" -- HIT");
|
||||
System.out.println("\tFound: '" + fn + "'");
|
||||
return new BufferedReader(new FileReader(directoryName + fn),32*1024);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
403
tools/unicodetools/com/ibm/text/utility/XMLParse.java
Normal file
403
tools/unicodetools/com/ibm/text/utility/XMLParse.java
Normal file
|
@ -0,0 +1,403 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/**
|
||||
* Very dumb XML parser, designed for restricted environment where transmitter is guaranteed
|
||||
* to limit types of XML files generated.
|
||||
*
|
||||
* RESTRICTIONS
|
||||
* Requires document to be well-formed. Doesn't properly signal errors if it is not.
|
||||
* No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA
|
||||
* No processing instructions
|
||||
* Does do character references, lt, gt, amp, apos, quot
|
||||
* The encoding is specified by the user, by using the right Reader
|
||||
* On creation, you supply a buffer for the textual elements. Use a buffer that is as large
|
||||
* as the largest possible piece of text (e.g. attribute value or element text) in the file.
|
||||
*
|
||||
* @author Mark Davis
|
||||
*/
|
||||
import java.io.*;
|
||||
|
||||
public final class XMLParse implements XMLParseTypes {
|
||||
|
||||
/**
 * Creates a parser that reads from {@code stream}.
 *
 * @param stream source of the XML text
 * @param buffer receives the text of each item; it is kept by reference,
 *               not copied
 */
public XMLParse(Reader stream, char[] buffer) {
    this.buffer = buffer;
    this.stream = stream;
}
|
||||
|
||||
/**
 * Creates a parser that reads the named file through a FileReader wrapped
 * in a BufferedReader with a 32*1024 char buffer.
 *
 * @param fileName path of the XML file
 * @param buffer   receives the text of each item; it is kept by
 *                 reference, not copied
 * @throws FileNotFoundException if the file cannot be opened
 */
public XMLParse(String fileName, char[] buffer) throws FileNotFoundException {
    this(new BufferedReader(new FileReader(fileName),32*1024), buffer);
}
|
||||
|
||||
/** Get the textual value associated with this item.
|
||||
* Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
|
||||
*/
|
||||
public String getValue() {
|
||||
return String.valueOf(buffer, 0, bufferCount);
|
||||
}
|
||||
|
||||
/** Returns the number of chars at the start of the buffer (see getValueArray())
 * that make up the current item's text; getValue() copies exactly these chars.
 * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
 */
|
||||
public int getValueCount() {
|
||||
return bufferCount;
|
||||
}
|
||||
|
||||
/** Returns the very array that was passed in on creation (not a copy).
 * The parser writes item text into it, so its contents change as next() is called.
 */
|
||||
public char[] getValueArray() {
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/** Returns the "kind" of the last item that next() delivered (see XMLParseTypes).
 * The field starts out as TEXT, and next() does not store DONE in it, so after
 * the end of input this still reports the last real item.
 */
|
||||
public int getKind() {
|
||||
return kind;
|
||||
}
|
||||
|
||||
/** Get the next element, returning a "Kind" (see XMLParseTypes)
|
||||
*/
|
||||
|
||||
public byte next() {
|
||||
|
||||
char c = '\u0000';
|
||||
char type = c;
|
||||
|
||||
while (c != 0xFFFF) {
|
||||
try {
|
||||
|
||||
// First read the character. If there is a buffered char, use it instead
|
||||
|
||||
if (bufferChar != 0) {
|
||||
c = bufferChar;
|
||||
bufferChar = 0;
|
||||
} else {
|
||||
c = (char) stream.read();
|
||||
}
|
||||
|
||||
// Now set the right type. Since we assume validity, anything but the syntax chars
|
||||
// can be classed as IDENTIFIER
|
||||
|
||||
switch (c) {
|
||||
case ' ': case '\r': case '\n': case '\t':
|
||||
type = ' ';
|
||||
break;
|
||||
case '<': case '>': case '#': case ';': case '/': case '\'': case '"':
|
||||
case '=': case '?': case '!': case '-':
|
||||
type = c;
|
||||
break;
|
||||
case '&': // CR, either numerical or lt, gt, quot, amp, apos
|
||||
|
||||
// gather characters
|
||||
|
||||
int crCount = 0;
|
||||
while (true) {
|
||||
c = (char) stream.read();
|
||||
if (c == ';') break;
|
||||
crBuffer[crCount++] = c;
|
||||
}
|
||||
|
||||
// parse it, and break into two pieces if necessary
|
||||
|
||||
int x = parseCR(crBuffer, crCount);
|
||||
c = (char)x;
|
||||
if (x > 0xFFFF) { // Supplementary
|
||||
x -= 0x10000;
|
||||
c = (char) (0xD800 + (x >> 10));
|
||||
bufferChar = (char) (0xDC00 + (x & 0x3FF));
|
||||
}
|
||||
|
||||
// Since we assume validity, any CRs are not syntax characters
|
||||
|
||||
type = IDENTIFIER; // everything else
|
||||
break;
|
||||
default:
|
||||
type = IDENTIFIER; // everything else
|
||||
break;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
c = '\uFFFF';
|
||||
}
|
||||
|
||||
// We now have a character. Throw it at our little state machine
|
||||
|
||||
if (SHOW) System.out.println(c + ", " + type + ", " + stateNames[state]);
|
||||
switch (state) {
|
||||
case IN_TEXT:
|
||||
if (type == '<') {
|
||||
state = START_ELEMENT;
|
||||
if (bufferCount != 0) {
|
||||
kind = TEXT;
|
||||
return kind;
|
||||
}
|
||||
break;
|
||||
}
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
case START_ELEMENT: // must be either '/' or more than one ID char
|
||||
bufferCount = 0;
|
||||
switch (type) {
|
||||
case '/':
|
||||
elementType = ELEMENT_TAG_SLASH;
|
||||
state = IN_ELEMENT;
|
||||
break;
|
||||
case '!':
|
||||
buffer[bufferCount++] = c;
|
||||
elementType = ELEMENT_TAG_COMMENT;
|
||||
state = IN_COMMENT;
|
||||
break;
|
||||
case '?':
|
||||
elementType = ELEMENT_TAG_QUESTION;
|
||||
state = IN_ELEMENT;
|
||||
break;
|
||||
default:
|
||||
elementType = ELEMENT_TAG;
|
||||
buffer[bufferCount++] = c;
|
||||
state = IN_ELEMENT;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case IN_COMMENT:
|
||||
buffer[bufferCount++] = c;
|
||||
if (type == '-') state = IN_COMMENT2;
|
||||
else state = IN_COMMENT;
|
||||
break;
|
||||
case IN_COMMENT2:
|
||||
buffer[bufferCount++] = c;
|
||||
if (type == '-') state = IN_COMMENT3;
|
||||
else state = IN_COMMENT;
|
||||
break;
|
||||
case IN_COMMENT3:
|
||||
if (type == '>') {
|
||||
kind = ELEMENT_TAG_COMMENT;
|
||||
bufferChar = c;
|
||||
state = IN_ATTRIBUTES;
|
||||
elementType = END_ELEMENT_COMMENT;
|
||||
return kind;
|
||||
} else if (type != '-') {
|
||||
state = IN_COMMENT;
|
||||
}
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
case IN_ELEMENT:
|
||||
if (type != IDENTIFIER) {
|
||||
state = IN_ATTRIBUTES;
|
||||
kind = elementType;
|
||||
elementType = END_ELEMENT;
|
||||
bufferChar = c;
|
||||
return kind;
|
||||
}
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
case IN_ATTRIBUTES:
|
||||
bufferCount = 0;
|
||||
if (type == '/') {
|
||||
elementType = END_ELEMENT_SLASH;
|
||||
} else if (type == '?') {
|
||||
elementType = END_ELEMENT_QUESTION;
|
||||
} else if (type == '>') {
|
||||
state = IN_TEXT;
|
||||
kind = elementType;
|
||||
return kind;
|
||||
} else if (type == IDENTIFIER) {
|
||||
state = IN_ATTR;
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case IN_ATTR:
|
||||
if (type != IDENTIFIER) {
|
||||
state = START_VALUE;
|
||||
kind = ATTRIBUTE_TAG;
|
||||
return kind;
|
||||
}
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
case START_VALUE: // must have <s>* = ( ' | " )
|
||||
if (type == '\'' || type == '"') {
|
||||
lastQuote = c;
|
||||
state = IN_VALUE;
|
||||
bufferCount = 0;
|
||||
}
|
||||
break;
|
||||
case IN_VALUE: // only terminated by lastQuote
|
||||
if (type == lastQuote) {
|
||||
state = IN_ATTRIBUTES;
|
||||
kind = ATTRIBUTE_VALUE;
|
||||
return kind;
|
||||
}
|
||||
buffer[bufferCount++] = c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/** Utility for doing XML quotes on a single code point with no flags set;
 * shorthand for quote(c, 0).
 * (see XMLParseTypes for the flag values the two-argument form accepts)
 */
|
||||
|
||||
public static String quote(int c) {
|
||||
return quote(c, 0);
|
||||
}
|
||||
|
||||
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
||||
* (see XMLParseTypes for values)
|
||||
*/
|
||||
|
||||
public static String quote(int c, int flags) {
|
||||
String result = quoteGuts(c, flags);
|
||||
if (result != null) return result;
|
||||
return String.valueOf((char)c);
|
||||
}
|
||||
|
||||
/** Utility for doing XML quotes on a whole string with no flags set;
 * shorthand for quote(source, 0).
 * (see XMLParseTypes for the flag values the two-argument form accepts)
 */
|
||||
|
||||
public static String quote(String source) {
|
||||
return quote(source, 0);
|
||||
}
|
||||
|
||||
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
||||
* (see XMLParseTypes for values)
|
||||
*/
|
||||
|
||||
public static String quote(String source, int flags) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
String temp;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
int c = UTF32.char32At(source, i);
|
||||
if (c > 0xFFFF) ++i;
|
||||
temp = quoteGuts(c, flags);
|
||||
if (temp != null) result.append(temp);
|
||||
else if (c <= 0xFFFF) result.append((char)c);
|
||||
else result.append(source.substring(i-1,i+1)); // surrogates
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/** Parses inside of CR. buffer should not contain the initial '&', or final ';'
|
||||
*/
|
||||
static int parseCR(char[] crBuffer, int crCount) {
|
||||
int c;
|
||||
int start = 0;
|
||||
if (crCount == 0) return -1;
|
||||
switch (crBuffer[start++]) {
|
||||
case 'l': c = '<'; break; // lt
|
||||
case 'g': c = '>'; break; // gt
|
||||
case 'q': c = '"'; break; // quot
|
||||
case 'a': // &, '
|
||||
if (crCount > start && crBuffer[start] == 'm') c = '&';
|
||||
else c = '\'';
|
||||
break;
|
||||
case '#':
|
||||
int radix = 10;
|
||||
if (crCount > start && crBuffer[start] == 'x') {
|
||||
radix = 16;
|
||||
++start;
|
||||
}
|
||||
// Simple code for now. Could be sped up.
|
||||
c = Integer.parseInt(String.valueOf(crBuffer,start,crCount-start), radix);
|
||||
break;
|
||||
default:
|
||||
c = -1;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/** Utility for doing hex, padding with zeros
|
||||
*/
|
||||
|
||||
static public String hex(long i, int places) {
|
||||
String result = Long.toString(i, 16).toUpperCase();
|
||||
if (result.length() < places) {
|
||||
result = "0000000000000000".substring(result.length(),places) + result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
// =================== PRIVATES =================================
|
||||
|
||||
private static final char[] buf2 = new char[2];
|
||||
|
||||
private static final boolean SHOW = false;
|
||||
|
||||
private char[] buffer;
|
||||
private int bufferCount;
|
||||
private byte kind = TEXT;
|
||||
|
||||
private Reader stream;
|
||||
private char[] crBuffer = new char[10];
|
||||
private int state = IN_TEXT;
|
||||
private byte elementType;
|
||||
private char lastQuote;
|
||||
private char bufferChar;
|
||||
|
||||
private static final byte IN_TEXT = 0, START_ELEMENT = 1, IN_ELEMENT = 2,
|
||||
IN_ATTR = 3, START_VALUE = 4, IN_VALUE = 5, IN_ATTRIBUTES = 6,
|
||||
IN_COMMENT = 7, IN_COMMENT2 = 8, IN_COMMENT3 = 9;
|
||||
|
||||
private static final String[] stateNames = {"IN_TEXT", "START_ELEMENT", "IN_ELEMENT",
|
||||
"IN_ATTR", "START_VALUE", "IN_VALUE", "IN_ATTRIBUTES",
|
||||
"IN_COMMENT", "IN_COMMENT2", "IN_COMMENT3"};
|
||||
|
||||
private static final char IDENTIFIER = 'a';
|
||||
|
||||
|
||||
private static String quoteGuts(int c, int flags) {
|
||||
String prefix = "&";
|
||||
switch (c) {
|
||||
case '<': return "<";
|
||||
case '>': return ">";
|
||||
case '&': return "&";
|
||||
case '\'': return "'";
|
||||
case '"': return """;
|
||||
|
||||
// Optionally fix TAB, CR, LF
|
||||
|
||||
case 0x09: case 0x0A: case 0x0D:
|
||||
if ((flags & QUOTE_TABCRLF) == 0) return null;
|
||||
break;
|
||||
|
||||
// Fix controls, non-characters, since XML can't handle
|
||||
|
||||
case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
|
||||
case 0x08: case 0x0B: case 0x0C: case 0x0E: case 0x0F:
|
||||
case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
|
||||
case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
|
||||
case 0x7F:
|
||||
case 0xFFFE: case 0xFFFF:
|
||||
prefix = "";
|
||||
break;
|
||||
|
||||
// Optionally fix IE Bug characters
|
||||
|
||||
case 0xFF00: case 0xFF01: case 0xFF02: case 0xFF03: case 0xFF04: case 0xFF05: case 0xFF06: case 0xFF07:
|
||||
case 0xFFF8: case 0xFFF9: case 0xFFFA: case 0xFFFB: case 0xFFFC: case 0xFFFD:
|
||||
if ((flags & QUOTE_IEBUG) == 0) return null;
|
||||
prefix = "";
|
||||
break;
|
||||
|
||||
default:
|
||||
if (c <= 0x7E) { // don't quote other ASCII
|
||||
if ((flags & QUOTE_ASCII) == 0) return null;
|
||||
} else if (0xD800 <= c && c <= 0xDFFF) {// fix surrogates, since XML can't handle
|
||||
prefix = "";
|
||||
} else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) {
|
||||
prefix = "";
|
||||
} else if ((flags & QUOTE_NON_ASCII) == 0) {
|
||||
return null;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if ((flags & QUOTE_DECIMAL) == 0) {
|
||||
return prefix + "#x" + hex(c,1) + ";";
|
||||
} else {
|
||||
return prefix + "#" + Integer.toString(c) + ";";
|
||||
}
|
||||
}
|
||||
}
|
35
tools/unicodetools/com/ibm/text/utility/XMLParseTypes.java
Normal file
35
tools/unicodetools/com/ibm/text/utility/XMLParseTypes.java
Normal file
|
@ -0,0 +1,35 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/**
 * Constants shared by XMLParse and its clients. A class may "implement"
 * this interface simply to refer to the names without the
 * XMLParseTypes. qualifier.
 */
public interface XMLParseTypes {

    // ---- Kind values, as returned by XMLParse.next() and getKind() ----

    public static final byte DONE = 0;

    public static final byte ELEMENT_TAG = 1;
    public static final byte ELEMENT_TAG_SLASH = 2;
    public static final byte ELEMENT_TAG_COMMENT = 3;
    public static final byte ELEMENT_TAG_QUESTION = 4;

    public static final byte END_ELEMENT = 5;
    public static final byte END_ELEMENT_SLASH = 6;
    public static final byte END_ELEMENT_COMMENT = 7;
    public static final byte END_ELEMENT_QUESTION = 8;

    public static final byte ATTRIBUTE_TAG = 9;
    public static final byte ATTRIBUTE_VALUE = 10;

    public static final byte TEXT = 11;

    // ---- Flag masks for XMLParse.quote(x, flags). Use '|' to combine ----

    public static final byte QUOTE_NON_ASCII = 1;
    public static final byte QUOTE_ASCII = 2;
    public static final byte QUOTE_IEBUG = 4;
    public static final byte QUOTE_TABCRLF = 8;
    public static final byte QUOTE_DECIMAL = 16;

    /** For debugging: the name of each kind, indexed by the kind value. */
    static final String[] kindNames = {
        "DONE",
        "ELEMENT_TAG", "ELEMENT_TAG_SLASH", "ELEMENT_TAG_COMMENT", "ELEMENT_TAG_QUESTION",
        "END_ELEMENT", "END_ELEMENT_SLASH", "END_ELEMENT_COMMENT", "END_ELEMENT_QUESTION",
        "ATTRIBUTE_TAG", "ATTRIBUTE_VALUE",
        "TEXT",
    };
}
|
336
tools/unicodetools/com/ibm/text/utility/testParser.java
Normal file
336
tools/unicodetools/com/ibm/text/utility/testParser.java
Normal file
|
@ -0,0 +1,336 @@
|
|||
package com.ibm.text.utility;
|
||||
|
||||
/** Simple Test program for XMLParse
|
||||
*/
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
public class testParser implements XMLParseTypes {
|
||||
public static final String BASE_DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\UNIDATA 3.0.1\\";
|
||||
public static final boolean VERBOSE = false;
|
||||
|
||||
private static final String testFile = BASE_DIR + "UCD-Main.xml"; // "test.xml"; // BASE_DIR + "UCD-Main.xml";
|
||||
|
||||
/**
 * Entry point. Currently runs only test3(); the calls to test1() and
 * test2() are commented out. {@code args} is not used.
 */
public static void main (String[] args) throws Exception {
|
||||
//test1();
|
||||
//test2();
|
||||
test3();
|
||||
}
|
||||
|
||||
public static void test1() throws Exception {
|
||||
XMLParse xml = new XMLParse(testFile, new char[1000]);
|
||||
for (int i = 0; i < 100000; ++i) {
|
||||
byte kind = xml.next();
|
||||
if (kind == DONE) break;
|
||||
String value = xml.getValue();
|
||||
int quoteFlags = QUOTE_IEBUG | QUOTE_NON_ASCII | (kind != TEXT ? QUOTE_TABCRLF : 0);
|
||||
String qValue = XMLParse.quote(value, quoteFlags);
|
||||
if (VERBOSE) System.out.println(kindNames[kind] + ", \"" + value + "\", \"" + qValue + "\"");
|
||||
else {
|
||||
switch (kind) {
|
||||
case ELEMENT_TAG: System.out.print('<' + qValue); break;
|
||||
case ELEMENT_TAG_SLASH: System.out.print("</" + qValue); break;
|
||||
case ELEMENT_TAG_COMMENT: System.out.print("<" + qValue); break;
|
||||
case ELEMENT_TAG_QUESTION: System.out.print("<?" + qValue); break;
|
||||
|
||||
case END_ELEMENT: System.out.print(">"); break;
|
||||
case END_ELEMENT_COMMENT: System.out.print(">"); break;
|
||||
case END_ELEMENT_SLASH: System.out.print("/>"); break;
|
||||
case END_ELEMENT_QUESTION: System.out.print("?>"); break;
|
||||
|
||||
case ATTRIBUTE_TAG: System.out.print(" " + qValue + "="); break;
|
||||
case ATTRIBUTE_VALUE: System.out.print("\"" + qValue + "\""); break;
|
||||
|
||||
case TEXT: System.out.print(qValue); break;
|
||||
|
||||
default: throw new Exception("Unknown KIND");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static final int NORMAL_QUOTE = QUOTE_NON_ASCII | QUOTE_IEBUG | QUOTE_TABCRLF;
|
||||
|
||||
static void test2() throws Exception {
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("UCD-Extract.html");
|
||||
|
||||
//int fieldCount = 4;
|
||||
//int width = 100/fieldCount;
|
||||
//int first = width + 100 - width*fieldCount;
|
||||
try {
|
||||
log.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
log.println("<style><!--");
|
||||
log.println("th { background-color: #99FFFF; text-align: Left; font-style: italic; font-weight: bold }");
|
||||
log.println("table { page-break-after: always }");
|
||||
log.println("--></style>");
|
||||
|
||||
log.println("<title>Extract from UCD</title>");
|
||||
log.println("</head><body>");
|
||||
|
||||
String tableHead = "<table border='1' width='100%' cellpadding='4'><tr>"
|
||||
+ "<th width='20'>Code</th>"
|
||||
+ "<th width='20'>Char</th>"
|
||||
+ "<th width='20'>GC</th>"
|
||||
+ "<th width='50%'>Props</th>"
|
||||
+ "<th width='50%'>Name</th></tr></tr>";
|
||||
log.println(tableHead);
|
||||
|
||||
XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
|
||||
boolean recordingChar = false;
|
||||
int topByte = 0;
|
||||
int printByte = 0;
|
||||
Map data = new TreeMap();
|
||||
String lastTag = "";
|
||||
|
||||
for (int line = 0; ; ++line) {
|
||||
byte kind = xml.next();
|
||||
if (kind == DONE) break;
|
||||
String value = xml.getValue();
|
||||
switch (kind) {
|
||||
case ELEMENT_TAG:
|
||||
recordingChar = value.equals("e");
|
||||
break;
|
||||
|
||||
case ATTRIBUTE_TAG:
|
||||
if (!recordingChar) break;
|
||||
lastTag = value;
|
||||
break;
|
||||
|
||||
case ATTRIBUTE_VALUE:
|
||||
if (!recordingChar) break;
|
||||
data.put(lastTag, value);
|
||||
break;
|
||||
|
||||
case END_ELEMENT:
|
||||
case END_ELEMENT_SLASH:
|
||||
if (!recordingChar) break;
|
||||
recordingChar = false;
|
||||
|
||||
// get data
|
||||
|
||||
String ch = (String)data.get("c");
|
||||
ch = fixHack(ch);
|
||||
String name = (String)data.get("n");
|
||||
if (name == null) name = "<computed>";
|
||||
String props = (String)data.get("xs");
|
||||
if (props == null) props = "\u00A0";
|
||||
String gc = (String)data.get("gc");
|
||||
if (gc == null) gc = "Lo";
|
||||
|
||||
// split tables
|
||||
int code = UTF32.char32At(ch, 0);
|
||||
if ((topByte & ~0x1F) != (code & ~0x1F)) {
|
||||
log.println("</table><br>");
|
||||
log.println(tableHead);
|
||||
topByte = code;
|
||||
if ((printByte & ~0xFF) != (code & ~0xFF)) {
|
||||
System.out.println("Printing table for " + XMLParse.hex(topByte,2));
|
||||
printByte = code;
|
||||
}
|
||||
}
|
||||
|
||||
// draw line
|
||||
|
||||
log.println("<tr><td>" + XMLParse.hex(code,4) +
|
||||
"</td><td>" + XMLParse.quote(ch,NORMAL_QUOTE) +
|
||||
"</td><td>" + XMLParse.quote(gc,NORMAL_QUOTE) +
|
||||
"</td><td>" + XMLParse.quote(props,NORMAL_QUOTE) +
|
||||
"</td><td>" + XMLParse.quote(name,NORMAL_QUOTE) + "</td></tr>");
|
||||
|
||||
// clear storage
|
||||
data.clear();
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
log.println("</table></body></html>");
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
static void test3() throws Exception {
|
||||
PrintWriter log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
|
||||
try {
|
||||
collect(log, "Other_Math");
|
||||
collect (log, "Other_Alphabetic");
|
||||
collect (log, "Other_Composite");
|
||||
//int fieldCount = 4;
|
||||
//int width = 100/fieldCount;
|
||||
//int first = width + 100 - width*fieldCount;
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
static final void collect(PrintWriter log, String prop) throws Exception {
|
||||
XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
|
||||
//boolean recordingChar = false;
|
||||
//int topByte = 0;
|
||||
//int printByte = 0;
|
||||
//Map data = new TreeMap();
|
||||
String lastTag = "";
|
||||
String lastChar = "";
|
||||
String lastName = "";
|
||||
String lastCat = "";
|
||||
int startChar = -1;
|
||||
int endChar = -2;
|
||||
String startName = "";
|
||||
String startCat = "";
|
||||
|
||||
for (int line = 0; ; ++line) {
|
||||
if ((line % 10000) == 0) System.err.println("Item " + line);
|
||||
byte kind = xml.next();
|
||||
if (kind == DONE) break;
|
||||
String value = xml.getValue();
|
||||
switch (kind) {
|
||||
case ATTRIBUTE_TAG:
|
||||
lastTag = value;
|
||||
break;
|
||||
|
||||
case ATTRIBUTE_VALUE:
|
||||
if (lastTag.equals("c")) lastChar = value;
|
||||
else if (lastTag.equals("n")) lastName = value;
|
||||
else if (lastTag.equals("gc")) lastCat = value;
|
||||
else if (lastTag.equals("xs") && value.indexOf(prop) >= 0) {
|
||||
lastChar = fixHack(lastChar);
|
||||
int ch = UTF32.char32At(lastChar,0);
|
||||
if (ch == endChar + 1) endChar = ch;
|
||||
else {
|
||||
//FDD0; FDEF; Noncharacter_Code_Point; # XX; 32;
|
||||
if (endChar >= 0) log.println(Utility.hex(startChar, 4) + "; "
|
||||
+ (endChar == startChar ? " " : Utility.hex(endChar, 4))
|
||||
+ "; " + prop
|
||||
+ "; # " + startCat
|
||||
+ "; " + (endChar-startChar+1)
|
||||
+ "; " + startName
|
||||
+ (endChar == startChar ? "" : "..."));
|
||||
startChar = endChar = ch;
|
||||
startName = lastName;
|
||||
startCat = lastCat;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (endChar >= 0) log.println(Utility.hex(startChar, 4) + "; "
|
||||
+ (endChar == startChar ? " " : Utility.hex(endChar, 4))
|
||||
+ "; " + prop
|
||||
+ "; # " + startCat
|
||||
+ "; " + (endChar-startChar+1)
|
||||
+ "; " + startName
|
||||
+ (endChar == startChar ? "" : "..."));
|
||||
}
|
||||
|
||||
static void test4() throws Exception {
|
||||
PrintWriter log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
|
||||
//int fieldCount = 4;
|
||||
//int width = 100/fieldCount;
|
||||
//int first = width + 100 - width*fieldCount;
|
||||
try {
|
||||
XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
|
||||
boolean recordingChar = false;
|
||||
//int topByte = 0;
|
||||
//int printByte = 0;
|
||||
Map data = new TreeMap();
|
||||
String lastTag = "";
|
||||
|
||||
for (int line = 0; ; ++line) {
|
||||
if ((line % 10000) == 0) System.err.println("Item " + line);
|
||||
byte kind = xml.next();
|
||||
if (kind == DONE) break;
|
||||
String value = xml.getValue();
|
||||
switch (kind) {
|
||||
case ELEMENT_TAG:
|
||||
recordingChar = value.equals("e");
|
||||
break;
|
||||
|
||||
case ATTRIBUTE_TAG:
|
||||
if (!recordingChar) break;
|
||||
lastTag = value;
|
||||
break;
|
||||
|
||||
case ATTRIBUTE_VALUE:
|
||||
if (!recordingChar) break;
|
||||
data.put(lastTag, value);
|
||||
break;
|
||||
|
||||
case END_ELEMENT:
|
||||
case END_ELEMENT_SLASH:
|
||||
if (!recordingChar) break;
|
||||
recordingChar = false;
|
||||
|
||||
// get data
|
||||
|
||||
String ch = (String)data.get("c");
|
||||
ch = fixHack(ch);
|
||||
|
||||
String name = (String)data.get("n");
|
||||
if (name == null) name = "<computed>";
|
||||
|
||||
String lc = (String)data.get("lc");
|
||||
if (lc == null) lc = ch;
|
||||
|
||||
String fc = (String)data.get("fc");
|
||||
if (fc == null) fc = (String)data.get("sl");
|
||||
if (fc == null) fc = lc;
|
||||
|
||||
if (fc.equals(ch)) continue;
|
||||
|
||||
if (fc.length() == 1) {
|
||||
log.println(Utility.hex(ch, " ") + "; C; " + Utility.hex(fc, " ") + "; # " + name);
|
||||
} else {
|
||||
log.println(Utility.hex(ch, " ") + "; F; " + Utility.hex(fc, " ") + "; # " + name);
|
||||
if (!lc.equals(ch)) {
|
||||
log.println(Utility.hex(ch, " ") + "; S; " + Utility.hex(lc, " ") + "; # " + name);
|
||||
}
|
||||
}
|
||||
|
||||
// clear storage
|
||||
data.clear();
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
static final String fixHack(String s) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
char last = '\u0000';
|
||||
int position = -1;
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (position > 0) {
|
||||
if (c == ';') {
|
||||
int x = Integer.parseInt(s.substring(position,i),16);
|
||||
result.append(UTF32.valueOf32(x));
|
||||
position = -1;
|
||||
}
|
||||
} else {
|
||||
if (last == '#' && c == 'x') {
|
||||
result.setLength(result.length()-1); // remove '#'
|
||||
position = i+1;
|
||||
} else {
|
||||
result.append(c);
|
||||
}
|
||||
}
|
||||
last = c;
|
||||
}
|
||||
if (result != null) return result.toString();
|
||||
return s;
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue