First check-in

X-SVN-Rev: 5636
This commit is contained in:
Mark Davis 2001-08-30 20:50:18 +00:00
parent b3321bad52
commit 1cd275c205
48 changed files with 20878 additions and 0 deletions

View file

@ -0,0 +1,216 @@
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
/**
 * A read-only view over a run of collation elements (CEs), each packed into one {@code int}.
 *
 * <p>Lists built by the public constructors own a private copy of their data. Lists produced by
 * {@link #sub}, {@link #start}, {@link #end} and {@link #append} are views: the slicing methods
 * share the backing array of the list they were cut from and differ only in their offsets.
 * All indices accepted by the public methods are relative to the start of <em>this</em> list,
 * never to the backing array.</p>
 *
 * <p>Ordering ({@link #compareTo}) is element by element on the signed {@code int} values; a list
 * that is a proper prefix of another sorts before it. {@link #equals} and {@link #hashCode} are
 * consistent with that ordering.</p>
 */
public final class CEList implements java.lang.Comparable, UCD_Types {
    /** Backing storage; only the slots in {@code [startOffset, endOffset)} belong to this list. */
    int[] contents;
    /** Index in {@link #contents} of the first element of this list. */
    int startOffset;
    /** Index in {@link #contents} one past the last element of this list. */
    int endOffset;
    /** Number of elements; always {@code endOffset - startOffset}. */
    int count;

    /**
     * Creates a list holding a private copy of {@code source[start..end)}.
     *
     * @param source array to copy from
     * @param start  index of the first element to copy
     * @param end    index one past the last element to copy
     */
    public CEList(int[] source, int start, int end) {
        count = end - start;
        contents = new int[count];
        System.arraycopy(source, start, contents, 0, count);
        startOffset = 0;
        endOffset = count;
    }

    /**
     * Creates a list holding a private copy of the whole of {@code source}.
     *
     * @param source array to copy from
     */
    public CEList(int[] source) {
        this(source, 0, source.length);
    }

    /**
     * Creates a view that shares {@code source} instead of copying it.
     *
     * @param spare unused; present only to give this constructor a distinct signature
     */
    private CEList(int[] source, int start, int end, boolean spare) {
        contents = source;
        startOffset = start;
        endOffset = end;
        count = end - start;
    }

    /**
     * Returns a new list consisting of this list followed by {@code that}.
     *
     * @param that list to append
     * @return the concatenation; neither operand is modified
     */
    public CEList append(CEList that) {
        int[] newContents = new int[count + that.count];
        System.arraycopy(contents, startOffset, newContents, 0, count);
        System.arraycopy(that.contents, that.startOffset, newContents, count, that.count);
        return new CEList(newContents, 0, count + that.count, true);
    }

    /**
     * Returns the elements {@code [start, end)} of this list as a view sharing its storage.
     *
     * @param start first index to include, relative to this list
     * @param end   index one past the last element to include, relative to this list
     * @return the requested slice
     * @throws ArrayIndexOutOfBoundsException if the range does not lie within this list
     */
    public CEList sub(int start, int end) {
        checkRange(start, end);
        // Translate list-relative indices into backing-array positions, so that slicing a
        // slice works as well as slicing a freshly constructed list.
        return new CEList(contents, startOffset + start, startOffset + end, true);
    }

    /**
     * Returns the first {@code end} elements of this list.
     *
     * @param end number of leading elements to keep
     * @return the prefix of that length
     * @throws ArrayIndexOutOfBoundsException if {@code end} is negative or exceeds {@link #length()}
     */
    public CEList start(int end) {
        return sub(0, end);
    }

    /**
     * Returns this list with its first {@code start} elements removed.
     *
     * @param start number of leading elements to drop
     * @return the remaining suffix
     * @throws ArrayIndexOutOfBoundsException if {@code start} is negative or exceeds {@link #length()}
     */
    public CEList end(int start) {
        return sub(start, count);
    }

    /** Rejects any range that is not fully contained in {@code [0, count]}. */
    private void checkRange(int start, int end) {
        if (start < 0 || end < start || end > count) {
            throw new ArrayIndexOutOfBoundsException(
                "range [" + start + ", " + end + ") is outside a list of length " + count);
        }
    }

    /** @return the number of collation elements in this list */
    public int length() {
        return count;
    }

    /**
     * Returns the element at position {@code i} of this list.
     *
     * @param i index relative to this list, {@code 0 <= i < length()}
     * @return the collation element at that position
     * @throws ArrayIndexOutOfBoundsException if {@code i} is out of range
     */
    public int at(int i) {
        if (i < 0 || i >= count) throw new ArrayIndexOutOfBoundsException(i);
        return contents[startOffset + i];
    }

    /** Hash over the length and the elements of this list only (not the rest of the backing array). */
    public int hashCode() {
        int result = count;
        for (int i = startOffset; i < endOffset; ++i) {
            result *= 37;
            result += contents[i];
        }
        return result;
    }

    /**
     * Two lists are equal when they have the same length and the same elements in the same order,
     * regardless of how they are laid out in their backing arrays.
     *
     * @return {@code false} for {@code null} and for objects that are not a {@code CEList}
     */
    public boolean equals(Object other) {
        if (this == other) return true;
        if (!(other instanceof CEList)) return false;
        CEList that = (CEList) other;
        if (count != that.count) return false;
        for (int i = 0; i < count; ++i) {
            if (contents[startOffset + i] != that.contents[that.startOffset + i]) return false;
        }
        return true;
    }

    /**
     * Compares element by element using signed {@code int} comparison; if one list is a prefix of
     * the other, the shorter one is smaller.
     *
     * @param other the list to compare with; must be a {@code CEList}
     * @return -1, 0 or 1
     * @throws ClassCastException if {@code other} is not a {@code CEList}
     */
    public int compareTo(Object other) {
        CEList that = (CEList) other;
        int shared = (count < that.count) ? count : that.count;
        for (int i = 0; i < shared; ++i) {
            int mine = contents[startOffset + i];
            int theirs = that.contents[that.startOffset + i];
            if (mine != theirs) {
                return (mine < theirs) ? -1 : 1;
            }
        }
        if (count < that.count) return -1;
        if (count > that.count) return 1;
        return 0;
    }

    /**
     * Derives the tertiary weight to use for a character from its decomposition type.
     *
     * @param ch   the original character (code point)
     * @param type decomposition type, one of the {@code UCD_Types} constants
     * @param t    tertiary weight of the collation element being remapped
     * @return the remapped tertiary weight; {@code t} unchanged (narrowed to a byte) when neither
     *         the kana adjustment nor any {@code switch} case applies
     */
    public static byte remap(int ch, byte type, int t) {
        // For anything but a canonical decomposition, kana get their own base tertiary first.
        if (type != CANONICAL) {
            if (0x3041 <= ch && ch <= 0x3094) t = 0xE; // hiragana
            else if (0x30A1 <= ch && ch <= 0x30FA) t = 0x11; // katakana
        }
        // In the cases below, t == 8 selects the "CAP" variant of the result (see NAME3).
        switch (type) {
            case COMPATIBILITY: t = (t == 8) ? 0xA : 4; break;
            case COMPAT_FONT: t = (t == 8) ? 0xB : 5; break;
            case COMPAT_NOBREAK: t = 0x1B; break;
            case COMPAT_INITIAL: t = 0x17; break;
            case COMPAT_MEDIAL: t = 0x18; break;
            case COMPAT_FINAL: t = 0x19; break;
            case COMPAT_ISOLATED: t = 0x1A; break;
            case COMPAT_CIRCLE: t = (t == 0x11) ? 0x13 : (t == 8) ? 0xC : 6; break;
            case COMPAT_SUPER: t = 0x14; break;
            case COMPAT_SUB: t = 0x15; break;
            case COMPAT_VERTICAL: t = 0x16; break;
            case COMPAT_WIDE: t = (t == 8) ? 9 : 3; break;
            case COMPAT_NARROW: t = (0xFF67 <= ch && ch <= 0xFF6F) ? 0x10 : 0x12; break;
            case COMPAT_SMALL: t = (t == 0xE) ? 0xE : 0xF; break;
            case COMPAT_SQUARE: t = (t == 8) ? 0x1D : 0x1C; break;
            case COMPAT_FRACTION: t = 0x1E; break;
        }
        return (byte)t;
    }

    /** @return the elements of this list formatted by {@link #toString(int)}, separated by spaces */
    public String toString() {
        StringBuffer result = new StringBuffer();
        for (int i = startOffset; i < endOffset; ++i) {
            if (i != startOffset) result.append(' ');
            result.append(toString(contents[i]));
        }
        return result.toString();
    }

    /**
     * Formats one collation element as {@code [primary.secondary.tertiary](NAME)}, where NAME is
     * the {@link #NAME3} label of the tertiary weight, or {@code ?} if the weight has no label.
     *
     * @param ce packed collation element
     * @return the formatted text
     */
    public static String toString(int ce) {
        int tertiary = UCA.getTertiary(ce);
        // Never let a diagnostic string throw just because a weight lies outside the name table.
        String label = (tertiary >= 0 && tertiary < NAME3.length) ? NAME3[tertiary] : "?";
        return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
            + Utility.hex(UCA.getSecondary(ce)) + "."
            + Utility.hex(UCA.getTertiary(ce)) + "](" + label + ")";
    }

    /** Display labels for tertiary weights, indexed by weight value (0x00 through 0x1F). */
    static final String[] NAME3 = {
        "IGNORE", // 0
        "BLK", // Unused?
        "MIN",
        "WIDE",
        "COMPAT",
        "FONT",
        "CIRCLE",
        "RES-2",
        "CAP",
        "WIDECAP",
        "COMPATCAP",
        "FONTCAP",
        "CIRCLECAP",
        "HIRA-SMALL",
        "HIRA",
        "SMALL",
        "SMALL-NARROW",
        "KATA",
        "NARROW",
        "CIRCLE-KATA",
        "SUP-MNN",
        "SUB-MNS",
        "VERT", // Missing??
        "AINI",
        "AMED",
        "AFIN",
        "AISO",
        "NOBREAK", // Missing?
        "SQUARED",
        "SQUAREDCAP",
        "FRACTION",
        "MAX"
    };

    // testing
    /** Ad-hoc smoke test: prints the results of a few slicing and comparison operations. */
    public static void main(String args[]) throws Exception {
        /* This: [0241.0020.0004], that: [0F6B.0020.0002]
        1, 2, 1, 2
        0, 1, 1, 1
        */
        CEList t1 = new CEList(new int[] {0, 0x02412004});
        t1 = t1.sub(1,2);
        CEList t2 = new CEList(new int[] {0x0F6B2002});
        System.out.println(t1.compareTo(t2));
        CEList foo = new CEList(new int[] {0, 1, 2, 3, 4});
        CEList fuu = new CEList(new int[] {});
        int cc = foo.compareTo(fuu);
        System.out.println(cc);
        System.out.println(foo);
        System.out.println(foo.start(2));
        System.out.println(foo.end(1));
        CEList fii = new CEList(new int[] {2, 3});
        CEList foo2 = foo.sub(2,4);
        System.out.println(fii.equals(foo2));
        System.out.println(fii.compareTo(foo2));
        System.out.println(fii.compareTo(foo));
        System.out.println(fii.hashCode() == foo2.hashCode());
    }
}

View file

@ -0,0 +1,813 @@
package com.ibm.text.UCA;
/**
 * Case folding for UTF-16 code units, driven by the fixed lookup table {@link #CF}.
 * Folding is applied one {@code char} at a time; a single character may fold to several.
 */
public final class Case {
    /**
     * Formerly the shared scratch buffer of {@link #fold(String)}. Folding now uses a buffer
     * local to each call; the field is kept only so existing references to it still compile.
     */
    static StringBuffer out = new StringBuffer();

    /**
     * Folds a single character.
     *
     * @param c the character to fold
     * @return the folded form, which may be longer than one character
     */
    static String fold(char c) {
        return fold(String.valueOf(c));
    }

    /**
     * Folds every character of {@code in} through {@link #CF}; characters without a table entry
     * are copied unchanged.
     *
     * @param in the text to fold; must not be {@code null}
     * @return the folded text
     */
    static String fold(String in) {
        // A per-call buffer: no shared mutable state, so no global lock is needed.
        StringBuffer result = new StringBuffer(in.length());
        for (int i = 0; i < in.length(); ++i) {
            char c = in.charAt(i);
            String f = CF[c];
            if (f == null) result.append(c);
            else result.append(f);
        }
        return result.toString();
    }

    /** Folding table indexed by UTF-16 code unit; {@code null} means "folds to itself". */
    static String[] CF = new String[65536];
    static {
        CF[0x0041]="\u0061";
        CF[0x0042]="\u0062";
        CF[0x0043]="\u0063";
        CF[0x0044]="\u0064";
        CF[0x0045]="\u0065";
        CF[0x0046]="\u0066";
        CF[0x0047]="\u0067";
        CF[0x0048]="\u0068";
        CF[0x0049]="\u0069";
        CF[0x004A]="\u006A";
        CF[0x004B]="\u006B";
        CF[0x004C]="\u006C";
        CF[0x004D]="\u006D";
        CF[0x004E]="\u006E";
        CF[0x004F]="\u006F";
        CF[0x0050]="\u0070";
        CF[0x0051]="\u0071";
        CF[0x0052]="\u0072";
        CF[0x0053]="\u0073";
        CF[0x0054]="\u0074";
        CF[0x0055]="\u0075";
        CF[0x0056]="\u0076";
        CF[0x0057]="\u0077";
        CF[0x0058]="\u0078";
        CF[0x0059]="\u0079";
        CF[0x005A]="\u007A";
        CF[0x00B5]="\u03BC";
        CF[0x00C0]="\u00E0";
        CF[0x00C1]="\u00E1";
        CF[0x00C2]="\u00E2";
        CF[0x00C3]="\u00E3";
        CF[0x00C4]="\u00E4";
        CF[0x00C5]="\u00E5";
        CF[0x00C6]="\u00E6";
        CF[0x00C7]="\u00E7";
        CF[0x00C8]="\u00E8";
        CF[0x00C9]="\u00E9";
        CF[0x00CA]="\u00EA";
        CF[0x00CB]="\u00EB";
        CF[0x00CC]="\u00EC";
        CF[0x00CD]="\u00ED";
        CF[0x00CE]="\u00EE";
        CF[0x00CF]="\u00EF";
        CF[0x00D0]="\u00F0";
        CF[0x00D1]="\u00F1";
        CF[0x00D2]="\u00F2";
        CF[0x00D3]="\u00F3";
        CF[0x00D4]="\u00F4";
        CF[0x00D5]="\u00F5";
        CF[0x00D6]="\u00F6";
        CF[0x00D8]="\u00F8";
        CF[0x00D9]="\u00F9";
        CF[0x00DA]="\u00FA";
        CF[0x00DB]="\u00FB";
        CF[0x00DC]="\u00FC";
        CF[0x00DD]="\u00FD";
        CF[0x00DE]="\u00FE";
        CF[0x00DF]="\u0073\u0073";
        CF[0x0100]="\u0101";
        CF[0x0102]="\u0103";
        CF[0x0104]="\u0105";
        CF[0x0106]="\u0107";
        CF[0x0108]="\u0109";
        CF[0x010A]="\u010B";
        CF[0x010C]="\u010D";
        CF[0x010E]="\u010F";
        CF[0x0110]="\u0111";
        CF[0x0112]="\u0113";
        CF[0x0114]="\u0115";
        CF[0x0116]="\u0117";
        CF[0x0118]="\u0119";
        CF[0x011A]="\u011B";
        CF[0x011C]="\u011D";
        CF[0x011E]="\u011F";
        CF[0x0120]="\u0121";
        CF[0x0122]="\u0123";
        CF[0x0124]="\u0125";
        CF[0x0126]="\u0127";
        CF[0x0128]="\u0129";
        CF[0x012A]="\u012B";
        CF[0x012C]="\u012D";
        CF[0x012E]="\u012F";
        CF[0x0130]="\u0069";
        CF[0x0131]="\u0069";
        CF[0x0132]="\u0133";
        CF[0x0134]="\u0135";
        CF[0x0136]="\u0137";
        CF[0x0139]="\u013A";
        CF[0x013B]="\u013C";
        CF[0x013D]="\u013E";
        CF[0x013F]="\u0140";
        CF[0x0141]="\u0142";
        CF[0x0143]="\u0144";
        CF[0x0145]="\u0146";
        CF[0x0147]="\u0148";
        CF[0x0149]="\u02BC\u006E";
        CF[0x014A]="\u014B";
        CF[0x014C]="\u014D";
        CF[0x014E]="\u014F";
        CF[0x0150]="\u0151";
        CF[0x0152]="\u0153";
        CF[0x0154]="\u0155";
        CF[0x0156]="\u0157";
        CF[0x0158]="\u0159";
        CF[0x015A]="\u015B";
        CF[0x015C]="\u015D";
        CF[0x015E]="\u015F";
        CF[0x0160]="\u0161";
        CF[0x0162]="\u0163";
        CF[0x0164]="\u0165";
        CF[0x0166]="\u0167";
        CF[0x0168]="\u0169";
        CF[0x016A]="\u016B";
        CF[0x016C]="\u016D";
        CF[0x016E]="\u016F";
        CF[0x0170]="\u0171";
        CF[0x0172]="\u0173";
        CF[0x0174]="\u0175";
        CF[0x0176]="\u0177";
        CF[0x0178]="\u00FF";
        CF[0x0179]="\u017A";
        CF[0x017B]="\u017C";
        CF[0x017D]="\u017E";
        CF[0x017F]="\u0073";
        CF[0x0181]="\u0253";
        CF[0x0182]="\u0183";
        CF[0x0184]="\u0185";
        CF[0x0186]="\u0254";
        CF[0x0187]="\u0188";
        CF[0x0189]="\u0256";
        CF[0x018A]="\u0257";
        CF[0x018B]="\u018C";
        CF[0x018E]="\u01DD";
        CF[0x018F]="\u0259";
        CF[0x0190]="\u025B";
        CF[0x0191]="\u0192";
        CF[0x0193]="\u0260";
        CF[0x0194]="\u0263";
        CF[0x0196]="\u0269";
        CF[0x0197]="\u0268";
        CF[0x0198]="\u0199";
        CF[0x019C]="\u026F";
        CF[0x019D]="\u0272";
        CF[0x019F]="\u0275";
        CF[0x01A0]="\u01A1";
        CF[0x01A2]="\u01A3";
        CF[0x01A4]="\u01A5";
        CF[0x01A6]="\u0280";
        CF[0x01A7]="\u01A8";
        CF[0x01A9]="\u0283";
        CF[0x01AC]="\u01AD";
        CF[0x01AE]="\u0288";
        CF[0x01AF]="\u01B0";
        CF[0x01B1]="\u028A";
        CF[0x01B2]="\u028B";
        CF[0x01B3]="\u01B4";
        CF[0x01B5]="\u01B6";
        CF[0x01B7]="\u0292";
        CF[0x01B8]="\u01B9";
        CF[0x01BC]="\u01BD";
        CF[0x01C4]="\u01C6";
        CF[0x01C5]="\u01C6";
        CF[0x01C7]="\u01C9";
        CF[0x01C8]="\u01C9";
        CF[0x01CA]="\u01CC";
        CF[0x01CB]="\u01CC";
        CF[0x01CD]="\u01CE";
        CF[0x01CF]="\u01D0";
        CF[0x01D1]="\u01D2";
        CF[0x01D3]="\u01D4";
        CF[0x01D5]="\u01D6";
        CF[0x01D7]="\u01D8";
        CF[0x01D9]="\u01DA";
        CF[0x01DB]="\u01DC";
        CF[0x01DE]="\u01DF";
        CF[0x01E0]="\u01E1";
        CF[0x01E2]="\u01E3";
        CF[0x01E4]="\u01E5";
        CF[0x01E6]="\u01E7";
        CF[0x01E8]="\u01E9";
        CF[0x01EA]="\u01EB";
        CF[0x01EC]="\u01ED";
        CF[0x01EE]="\u01EF";
        CF[0x01F0]="\u006A\u030C";
        CF[0x01F1]="\u01F3";
        CF[0x01F2]="\u01F3";
        CF[0x01F4]="\u01F5";
        CF[0x01F6]="\u0195";
        CF[0x01F7]="\u01BF";
        CF[0x01F8]="\u01F9";
        CF[0x01FA]="\u01FB";
        CF[0x01FC]="\u01FD";
        CF[0x01FE]="\u01FF";
        CF[0x0200]="\u0201";
        CF[0x0202]="\u0203";
        CF[0x0204]="\u0205";
        CF[0x0206]="\u0207";
        CF[0x0208]="\u0209";
        CF[0x020A]="\u020B";
        CF[0x020C]="\u020D";
        CF[0x020E]="\u020F";
        CF[0x0210]="\u0211";
        CF[0x0212]="\u0213";
        CF[0x0214]="\u0215";
        CF[0x0216]="\u0217";
        CF[0x0218]="\u0219";
        CF[0x021A]="\u021B";
        CF[0x021C]="\u021D";
        CF[0x021E]="\u021F";
        CF[0x0222]="\u0223";
        CF[0x0224]="\u0225";
        CF[0x0226]="\u0227";
        CF[0x0228]="\u0229";
        CF[0x022A]="\u022B";
        CF[0x022C]="\u022D";
        CF[0x022E]="\u022F";
        CF[0x0230]="\u0231";
        CF[0x0232]="\u0233";
        CF[0x0345]="\u03B9";
        CF[0x0386]="\u03AC";
        CF[0x0388]="\u03AD";
        CF[0x0389]="\u03AE";
        CF[0x038A]="\u03AF";
        CF[0x038C]="\u03CC";
        CF[0x038E]="\u03CD";
        CF[0x038F]="\u03CE";
        CF[0x0390]="\u03B9\u0308\u0301";
        CF[0x0391]="\u03B1";
        CF[0x0392]="\u03B2";
        CF[0x0393]="\u03B3";
        CF[0x0394]="\u03B4";
        CF[0x0395]="\u03B5";
        CF[0x0396]="\u03B6";
        CF[0x0397]="\u03B7";
        CF[0x0398]="\u03B8";
        CF[0x0399]="\u03B9";
        CF[0x039A]="\u03BA";
        CF[0x039B]="\u03BB";
        CF[0x039C]="\u03BC";
        CF[0x039D]="\u03BD";
        CF[0x039E]="\u03BE";
        CF[0x039F]="\u03BF";
        CF[0x03A0]="\u03C0";
        CF[0x03A1]="\u03C1";
        CF[0x03A3]="\u03C2";
        CF[0x03A4]="\u03C4";
        CF[0x03A5]="\u03C5";
        CF[0x03A6]="\u03C6";
        CF[0x03A7]="\u03C7";
        CF[0x03A8]="\u03C8";
        CF[0x03A9]="\u03C9";
        CF[0x03AA]="\u03CA";
        CF[0x03AB]="\u03CB";
        CF[0x03B0]="\u03C5\u0308\u0301";
        CF[0x03C3]="\u03C2";
        CF[0x03D0]="\u03B2";
        CF[0x03D1]="\u03B8";
        CF[0x03D5]="\u03C6";
        CF[0x03D6]="\u03C0";
        CF[0x03DA]="\u03DB";
        CF[0x03DC]="\u03DD";
        CF[0x03DE]="\u03DF";
        CF[0x03E0]="\u03E1";
        CF[0x03E2]="\u03E3";
        CF[0x03E4]="\u03E5";
        CF[0x03E6]="\u03E7";
        CF[0x03E8]="\u03E9";
        CF[0x03EA]="\u03EB";
        CF[0x03EC]="\u03ED";
        CF[0x03EE]="\u03EF";
        CF[0x03F0]="\u03BA";
        CF[0x03F1]="\u03C1";
        CF[0x03F2]="\u03C2";
        CF[0x0400]="\u0450";
        CF[0x0401]="\u0451";
        CF[0x0402]="\u0452";
        CF[0x0403]="\u0453";
        CF[0x0404]="\u0454";
        CF[0x0405]="\u0455";
        CF[0x0406]="\u0456";
        CF[0x0407]="\u0457";
        CF[0x0408]="\u0458";
        CF[0x0409]="\u0459";
        CF[0x040A]="\u045A";
        CF[0x040B]="\u045B";
        CF[0x040C]="\u045C";
        CF[0x040D]="\u045D";
        CF[0x040E]="\u045E";
        CF[0x040F]="\u045F";
        CF[0x0410]="\u0430";
        CF[0x0411]="\u0431";
        CF[0x0412]="\u0432";
        CF[0x0413]="\u0433";
        CF[0x0414]="\u0434";
        CF[0x0415]="\u0435";
        CF[0x0416]="\u0436";
        CF[0x0417]="\u0437";
        CF[0x0418]="\u0438";
        CF[0x0419]="\u0439";
        CF[0x041A]="\u043A";
        CF[0x041B]="\u043B";
        CF[0x041C]="\u043C";
        CF[0x041D]="\u043D";
        CF[0x041E]="\u043E";
        CF[0x041F]="\u043F";
        CF[0x0420]="\u0440";
        CF[0x0421]="\u0441";
        CF[0x0422]="\u0442";
        CF[0x0423]="\u0443";
        CF[0x0424]="\u0444";
        CF[0x0425]="\u0445";
        CF[0x0426]="\u0446";
        CF[0x0427]="\u0447";
        CF[0x0428]="\u0448";
        CF[0x0429]="\u0449";
        CF[0x042A]="\u044A";
        CF[0x042B]="\u044B";
        CF[0x042C]="\u044C";
        CF[0x042D]="\u044D";
        CF[0x042E]="\u044E";
        CF[0x042F]="\u044F";
        CF[0x0460]="\u0461";
        CF[0x0462]="\u0463";
        CF[0x0464]="\u0465";
        CF[0x0466]="\u0467";
        CF[0x0468]="\u0469";
        CF[0x046A]="\u046B";
        CF[0x046C]="\u046D";
        CF[0x046E]="\u046F";
        CF[0x0470]="\u0471";
        CF[0x0472]="\u0473";
        CF[0x0474]="\u0475";
        CF[0x0476]="\u0477";
        CF[0x0478]="\u0479";
        CF[0x047A]="\u047B";
        CF[0x047C]="\u047D";
        CF[0x047E]="\u047F";
        CF[0x0480]="\u0481";
        CF[0x048C]="\u048D";
        CF[0x048E]="\u048F";
        CF[0x0490]="\u0491";
        CF[0x0492]="\u0493";
        CF[0x0494]="\u0495";
        CF[0x0496]="\u0497";
        CF[0x0498]="\u0499";
        CF[0x049A]="\u049B";
        CF[0x049C]="\u049D";
        CF[0x049E]="\u049F";
        CF[0x04A0]="\u04A1";
        CF[0x04A2]="\u04A3";
        CF[0x04A4]="\u04A5";
        CF[0x04A6]="\u04A7";
        CF[0x04A8]="\u04A9";
        CF[0x04AA]="\u04AB";
        CF[0x04AC]="\u04AD";
        CF[0x04AE]="\u04AF";
        CF[0x04B0]="\u04B1";
        CF[0x04B2]="\u04B3";
        CF[0x04B4]="\u04B5";
        CF[0x04B6]="\u04B7";
        CF[0x04B8]="\u04B9";
        CF[0x04BA]="\u04BB";
        CF[0x04BC]="\u04BD";
        CF[0x04BE]="\u04BF";
        CF[0x04C1]="\u04C2";
        CF[0x04C3]="\u04C4";
        CF[0x04C7]="\u04C8";
        CF[0x04CB]="\u04CC";
        CF[0x04D0]="\u04D1";
        CF[0x04D2]="\u04D3";
        CF[0x04D4]="\u04D5";
        CF[0x04D6]="\u04D7";
        CF[0x04D8]="\u04D9";
        CF[0x04DA]="\u04DB";
        CF[0x04DC]="\u04DD";
        CF[0x04DE]="\u04DF";
        CF[0x04E0]="\u04E1";
        CF[0x04E2]="\u04E3";
        CF[0x04E4]="\u04E5";
        CF[0x04E6]="\u04E7";
        CF[0x04E8]="\u04E9";
        CF[0x04EA]="\u04EB";
        CF[0x04EC]="\u04ED";
        CF[0x04EE]="\u04EF";
        CF[0x04F0]="\u04F1";
        CF[0x04F2]="\u04F3";
        CF[0x04F4]="\u04F5";
        CF[0x04F8]="\u04F9";
        CF[0x0531]="\u0561";
        CF[0x0532]="\u0562";
        CF[0x0533]="\u0563";
        CF[0x0534]="\u0564";
        CF[0x0535]="\u0565";
        CF[0x0536]="\u0566";
        CF[0x0537]="\u0567";
        CF[0x0538]="\u0568";
        CF[0x0539]="\u0569";
        CF[0x053A]="\u056A";
        CF[0x053B]="\u056B";
        CF[0x053C]="\u056C";
        CF[0x053D]="\u056D";
        CF[0x053E]="\u056E";
        CF[0x053F]="\u056F";
        CF[0x0540]="\u0570";
        CF[0x0541]="\u0571";
        CF[0x0542]="\u0572";
        CF[0x0543]="\u0573";
        CF[0x0544]="\u0574";
        CF[0x0545]="\u0575";
        CF[0x0546]="\u0576";
        CF[0x0547]="\u0577";
        CF[0x0548]="\u0578";
        CF[0x0549]="\u0579";
        CF[0x054A]="\u057A";
        CF[0x054B]="\u057B";
        CF[0x054C]="\u057C";
        CF[0x054D]="\u057D";
        CF[0x054E]="\u057E";
        CF[0x054F]="\u057F";
        CF[0x0550]="\u0580";
        CF[0x0551]="\u0581";
        CF[0x0552]="\u0582";
        CF[0x0553]="\u0583";
        CF[0x0554]="\u0584";
        CF[0x0555]="\u0585";
        CF[0x0556]="\u0586";
        CF[0x0587]="\u0565\u0582";
        CF[0x1E00]="\u1E01";
        CF[0x1E02]="\u1E03";
        CF[0x1E04]="\u1E05";
        CF[0x1E06]="\u1E07";
        CF[0x1E08]="\u1E09";
        CF[0x1E0A]="\u1E0B";
        CF[0x1E0C]="\u1E0D";
        CF[0x1E0E]="\u1E0F";
        CF[0x1E10]="\u1E11";
        CF[0x1E12]="\u1E13";
        CF[0x1E14]="\u1E15";
        CF[0x1E16]="\u1E17";
        CF[0x1E18]="\u1E19";
        CF[0x1E1A]="\u1E1B";
        CF[0x1E1C]="\u1E1D";
        CF[0x1E1E]="\u1E1F";
        CF[0x1E20]="\u1E21";
        CF[0x1E22]="\u1E23";
        CF[0x1E24]="\u1E25";
        CF[0x1E26]="\u1E27";
        CF[0x1E28]="\u1E29";
        CF[0x1E2A]="\u1E2B";
        CF[0x1E2C]="\u1E2D";
        CF[0x1E2E]="\u1E2F";
        CF[0x1E30]="\u1E31";
        CF[0x1E32]="\u1E33";
        CF[0x1E34]="\u1E35";
        CF[0x1E36]="\u1E37";
        CF[0x1E38]="\u1E39";
        CF[0x1E3A]="\u1E3B";
        CF[0x1E3C]="\u1E3D";
        CF[0x1E3E]="\u1E3F";
        CF[0x1E40]="\u1E41";
        CF[0x1E42]="\u1E43";
        CF[0x1E44]="\u1E45";
        CF[0x1E46]="\u1E47";
        CF[0x1E48]="\u1E49";
        CF[0x1E4A]="\u1E4B";
        CF[0x1E4C]="\u1E4D";
        CF[0x1E4E]="\u1E4F";
        CF[0x1E50]="\u1E51";
        CF[0x1E52]="\u1E53";
        CF[0x1E54]="\u1E55";
        CF[0x1E56]="\u1E57";
        CF[0x1E58]="\u1E59";
        CF[0x1E5A]="\u1E5B";
        CF[0x1E5C]="\u1E5D";
        CF[0x1E5E]="\u1E5F";
        CF[0x1E60]="\u1E61";
        CF[0x1E62]="\u1E63";
        CF[0x1E64]="\u1E65";
        CF[0x1E66]="\u1E67";
        CF[0x1E68]="\u1E69";
        CF[0x1E6A]="\u1E6B";
        CF[0x1E6C]="\u1E6D";
        CF[0x1E6E]="\u1E6F";
        CF[0x1E70]="\u1E71";
        CF[0x1E72]="\u1E73";
        CF[0x1E74]="\u1E75";
        CF[0x1E76]="\u1E77";
        CF[0x1E78]="\u1E79";
        CF[0x1E7A]="\u1E7B";
        CF[0x1E7C]="\u1E7D";
        CF[0x1E7E]="\u1E7F";
        CF[0x1E80]="\u1E81";
        CF[0x1E82]="\u1E83";
        CF[0x1E84]="\u1E85";
        CF[0x1E86]="\u1E87";
        CF[0x1E88]="\u1E89";
        CF[0x1E8A]="\u1E8B";
        CF[0x1E8C]="\u1E8D";
        CF[0x1E8E]="\u1E8F";
        CF[0x1E90]="\u1E91";
        CF[0x1E92]="\u1E93";
        CF[0x1E94]="\u1E95";
        CF[0x1E96]="\u0068\u0331";
        CF[0x1E97]="\u0074\u0308";
        CF[0x1E98]="\u0077\u030A";
        CF[0x1E99]="\u0079\u030A";
        CF[0x1E9A]="\u0061\u02BE";
        CF[0x1E9B]="\u1E61";
        CF[0x1EA0]="\u1EA1";
        CF[0x1EA2]="\u1EA3";
        CF[0x1EA4]="\u1EA5";
        CF[0x1EA6]="\u1EA7";
        CF[0x1EA8]="\u1EA9";
        CF[0x1EAA]="\u1EAB";
        CF[0x1EAC]="\u1EAD";
        CF[0x1EAE]="\u1EAF";
        CF[0x1EB0]="\u1EB1";
        CF[0x1EB2]="\u1EB3";
        CF[0x1EB4]="\u1EB5";
        CF[0x1EB6]="\u1EB7";
        CF[0x1EB8]="\u1EB9";
        CF[0x1EBA]="\u1EBB";
        CF[0x1EBC]="\u1EBD";
        CF[0x1EBE]="\u1EBF";
        CF[0x1EC0]="\u1EC1";
        CF[0x1EC2]="\u1EC3";
        CF[0x1EC4]="\u1EC5";
        CF[0x1EC6]="\u1EC7";
        CF[0x1EC8]="\u1EC9";
        CF[0x1ECA]="\u1ECB";
        CF[0x1ECC]="\u1ECD";
        CF[0x1ECE]="\u1ECF";
        CF[0x1ED0]="\u1ED1";
        CF[0x1ED2]="\u1ED3";
        CF[0x1ED4]="\u1ED5";
        CF[0x1ED6]="\u1ED7";
        CF[0x1ED8]="\u1ED9";
        CF[0x1EDA]="\u1EDB";
        CF[0x1EDC]="\u1EDD";
        CF[0x1EDE]="\u1EDF";
        CF[0x1EE0]="\u1EE1";
        CF[0x1EE2]="\u1EE3";
        CF[0x1EE4]="\u1EE5";
        CF[0x1EE6]="\u1EE7";
        CF[0x1EE8]="\u1EE9";
        CF[0x1EEA]="\u1EEB";
        CF[0x1EEC]="\u1EED";
        CF[0x1EEE]="\u1EEF";
        CF[0x1EF0]="\u1EF1";
        CF[0x1EF2]="\u1EF3";
        CF[0x1EF4]="\u1EF5";
        CF[0x1EF6]="\u1EF7";
        CF[0x1EF8]="\u1EF9";
        CF[0x1F08]="\u1F00";
        CF[0x1F09]="\u1F01";
        CF[0x1F0A]="\u1F02";
        CF[0x1F0B]="\u1F03";
        CF[0x1F0C]="\u1F04";
        CF[0x1F0D]="\u1F05";
        CF[0x1F0E]="\u1F06";
        CF[0x1F0F]="\u1F07";
        CF[0x1F18]="\u1F10";
        CF[0x1F19]="\u1F11";
        CF[0x1F1A]="\u1F12";
        CF[0x1F1B]="\u1F13";
        CF[0x1F1C]="\u1F14";
        CF[0x1F1D]="\u1F15";
        CF[0x1F28]="\u1F20";
        CF[0x1F29]="\u1F21";
        CF[0x1F2A]="\u1F22";
        CF[0x1F2B]="\u1F23";
        CF[0x1F2C]="\u1F24";
        CF[0x1F2D]="\u1F25";
        CF[0x1F2E]="\u1F26";
        CF[0x1F2F]="\u1F27";
        CF[0x1F38]="\u1F30";
        CF[0x1F39]="\u1F31";
        CF[0x1F3A]="\u1F32";
        CF[0x1F3B]="\u1F33";
        CF[0x1F3C]="\u1F34";
        CF[0x1F3D]="\u1F35";
        CF[0x1F3E]="\u1F36";
        CF[0x1F3F]="\u1F37";
        CF[0x1F48]="\u1F40";
        CF[0x1F49]="\u1F41";
        CF[0x1F4A]="\u1F42";
        CF[0x1F4B]="\u1F43";
        CF[0x1F4C]="\u1F44";
        CF[0x1F4D]="\u1F45";
        CF[0x1F50]="\u03C5\u0313";
        CF[0x1F52]="\u03C5\u0313\u0300";
        CF[0x1F54]="\u03C5\u0313\u0301";
        CF[0x1F56]="\u03C5\u0313\u0342";
        CF[0x1F59]="\u1F51";
        CF[0x1F5B]="\u1F53";
        CF[0x1F5D]="\u1F55";
        CF[0x1F5F]="\u1F57";
        CF[0x1F68]="\u1F60";
        CF[0x1F69]="\u1F61";
        CF[0x1F6A]="\u1F62";
        CF[0x1F6B]="\u1F63";
        CF[0x1F6C]="\u1F64";
        CF[0x1F6D]="\u1F65";
        CF[0x1F6E]="\u1F66";
        CF[0x1F6F]="\u1F67";
        CF[0x1F80]="\u1F00\u03B9";
        CF[0x1F81]="\u1F01\u03B9";
        CF[0x1F82]="\u1F02\u03B9";
        CF[0x1F83]="\u1F03\u03B9";
        CF[0x1F84]="\u1F04\u03B9";
        CF[0x1F85]="\u1F05\u03B9";
        CF[0x1F86]="\u1F06\u03B9";
        CF[0x1F87]="\u1F07\u03B9";
        CF[0x1F88]="\u1F00\u03B9";
        CF[0x1F89]="\u1F01\u03B9";
        CF[0x1F8A]="\u1F02\u03B9";
        CF[0x1F8B]="\u1F03\u03B9";
        CF[0x1F8C]="\u1F04\u03B9";
        CF[0x1F8D]="\u1F05\u03B9";
        CF[0x1F8E]="\u1F06\u03B9";
        CF[0x1F8F]="\u1F07\u03B9";
        CF[0x1F90]="\u1F20\u03B9";
        CF[0x1F91]="\u1F21\u03B9";
        CF[0x1F92]="\u1F22\u03B9";
        CF[0x1F93]="\u1F23\u03B9";
        CF[0x1F94]="\u1F24\u03B9";
        CF[0x1F95]="\u1F25\u03B9";
        CF[0x1F96]="\u1F26\u03B9";
        CF[0x1F97]="\u1F27\u03B9";
        CF[0x1F98]="\u1F20\u03B9";
        CF[0x1F99]="\u1F21\u03B9";
        CF[0x1F9A]="\u1F22\u03B9";
        CF[0x1F9B]="\u1F23\u03B9";
        CF[0x1F9C]="\u1F24\u03B9";
        CF[0x1F9D]="\u1F25\u03B9";
        CF[0x1F9E]="\u1F26\u03B9";
        CF[0x1F9F]="\u1F27\u03B9";
        CF[0x1FA0]="\u1F60\u03B9";
        CF[0x1FA1]="\u1F61\u03B9";
        CF[0x1FA2]="\u1F62\u03B9";
        CF[0x1FA3]="\u1F63\u03B9";
        CF[0x1FA4]="\u1F64\u03B9";
        CF[0x1FA5]="\u1F65\u03B9";
        CF[0x1FA6]="\u1F66\u03B9";
        CF[0x1FA7]="\u1F67\u03B9";
        CF[0x1FA8]="\u1F60\u03B9";
        CF[0x1FA9]="\u1F61\u03B9";
        CF[0x1FAA]="\u1F62\u03B9";
        CF[0x1FAB]="\u1F63\u03B9";
        CF[0x1FAC]="\u1F64\u03B9";
        CF[0x1FAD]="\u1F65\u03B9";
        CF[0x1FAE]="\u1F66\u03B9";
        CF[0x1FAF]="\u1F67\u03B9";
        CF[0x1FB2]="\u1F70\u03B9";
        CF[0x1FB3]="\u03B1\u03B9";
        CF[0x1FB4]="\u03AC\u03B9";
        CF[0x1FB6]="\u03B1\u0342";
        CF[0x1FB7]="\u03B1\u0342\u03B9";
        CF[0x1FB8]="\u1FB0";
        CF[0x1FB9]="\u1FB1";
        CF[0x1FBA]="\u1F70";
        CF[0x1FBB]="\u1F71";
        CF[0x1FBC]="\u03B1\u03B9";
        CF[0x1FBE]="\u03B9";
        CF[0x1FC2]="\u1F74\u03B9";
        CF[0x1FC3]="\u03B7\u03B9";
        CF[0x1FC4]="\u03AE\u03B9";
        CF[0x1FC6]="\u03B7\u0342";
        CF[0x1FC7]="\u03B7\u0342\u03B9";
        CF[0x1FC8]="\u1F72";
        CF[0x1FC9]="\u1F73";
        CF[0x1FCA]="\u1F74";
        CF[0x1FCB]="\u1F75";
        CF[0x1FCC]="\u03B7\u03B9";
        CF[0x1FD2]="\u03B9\u0308\u0300";
        CF[0x1FD3]="\u03B9\u0308\u0301";
        CF[0x1FD6]="\u03B9\u0342";
        CF[0x1FD7]="\u03B9\u0308\u0342";
        CF[0x1FD8]="\u1FD0";
        CF[0x1FD9]="\u1FD1";
        CF[0x1FDA]="\u1F76";
        CF[0x1FDB]="\u1F77";
        CF[0x1FE2]="\u03C5\u0308\u0300";
        CF[0x1FE3]="\u03C5\u0308\u0301";
        CF[0x1FE4]="\u03C1\u0313";
        CF[0x1FE6]="\u03C5\u0342";
        CF[0x1FE7]="\u03C5\u0308\u0342";
        CF[0x1FE8]="\u1FE0";
        CF[0x1FE9]="\u1FE1";
        CF[0x1FEA]="\u1F7A";
        CF[0x1FEB]="\u1F7B";
        CF[0x1FEC]="\u1FE5";
        CF[0x1FF2]="\u1F7C\u03B9";
        CF[0x1FF3]="\u03C9\u03B9";
        CF[0x1FF4]="\u03CE\u03B9";
        CF[0x1FF6]="\u03C9\u0342";
        CF[0x1FF7]="\u03C9\u0342\u03B9";
        CF[0x1FF8]="\u1F78";
        CF[0x1FF9]="\u1F79";
        CF[0x1FFA]="\u1F7C";
        CF[0x1FFB]="\u1F7D";
        CF[0x1FFC]="\u03C9\u03B9";
        CF[0x2126]="\u03C9";
        CF[0x212A]="\u006B";
        CF[0x212B]="\u00E5";
        CF[0x2160]="\u2170";
        CF[0x2161]="\u2171";
        CF[0x2162]="\u2172";
        CF[0x2163]="\u2173";
        CF[0x2164]="\u2174";
        CF[0x2165]="\u2175";
        CF[0x2166]="\u2176";
        CF[0x2167]="\u2177";
        CF[0x2168]="\u2178";
        CF[0x2169]="\u2179";
        CF[0x216A]="\u217A";
        CF[0x216B]="\u217B";
        CF[0x216C]="\u217C";
        CF[0x216D]="\u217D";
        CF[0x216E]="\u217E";
        CF[0x216F]="\u217F";
        CF[0x24B6]="\u24D0";
        CF[0x24B7]="\u24D1";
        CF[0x24B8]="\u24D2";
        CF[0x24B9]="\u24D3";
        CF[0x24BA]="\u24D4";
        CF[0x24BB]="\u24D5";
        CF[0x24BC]="\u24D6";
        CF[0x24BD]="\u24D7";
        CF[0x24BE]="\u24D8";
        CF[0x24BF]="\u24D9";
        CF[0x24C0]="\u24DA";
        CF[0x24C1]="\u24DB";
        CF[0x24C2]="\u24DC";
        CF[0x24C3]="\u24DD";
        CF[0x24C4]="\u24DE";
        CF[0x24C5]="\u24DF";
        CF[0x24C6]="\u24E0";
        CF[0x24C7]="\u24E1";
        CF[0x24C8]="\u24E2";
        CF[0x24C9]="\u24E3";
        CF[0x24CA]="\u24E4";
        CF[0x24CB]="\u24E5";
        CF[0x24CC]="\u24E6";
        CF[0x24CD]="\u24E7";
        CF[0x24CE]="\u24E8";
        CF[0x24CF]="\u24E9";
        CF[0xFB00]="\u0066\u0066";
        CF[0xFB01]="\u0066\u0069";
        CF[0xFB02]="\u0066\u006C";
        CF[0xFB03]="\u0066\u0066\u0069";
        CF[0xFB04]="\u0066\u0066\u006C";
        CF[0xFB05]="\u0073\u0074";
        CF[0xFB06]="\u0073\u0074";
        CF[0xFB13]="\u0574\u0576";
        CF[0xFB14]="\u0574\u0565";
        CF[0xFB15]="\u0574\u056B";
        CF[0xFB16]="\u057E\u0576";
        CF[0xFB17]="\u0574\u056D";
        CF[0xFF21]="\uFF41";
        CF[0xFF22]="\uFF42";
        CF[0xFF23]="\uFF43";
        CF[0xFF24]="\uFF44";
        CF[0xFF25]="\uFF45";
        CF[0xFF26]="\uFF46";
        CF[0xFF27]="\uFF47";
        CF[0xFF28]="\uFF48";
        CF[0xFF29]="\uFF49";
        CF[0xFF2A]="\uFF4A";
        CF[0xFF2B]="\uFF4B";
        CF[0xFF2C]="\uFF4C";
        CF[0xFF2D]="\uFF4D";
        CF[0xFF2E]="\uFF4E";
        CF[0xFF2F]="\uFF4F";
        CF[0xFF30]="\uFF50";
        CF[0xFF31]="\uFF51";
        CF[0xFF32]="\uFF52";
        CF[0xFF33]="\uFF53";
        CF[0xFF34]="\uFF54";
        CF[0xFF35]="\uFF55";
        CF[0xFF36]="\uFF56";
        CF[0xFF37]="\uFF57";
        CF[0xFF38]="\uFF58";
        CF[0xFF39]="\uFF59";
        CF[0xFF3A]="\uFF5A";
        // 785 case foldings total
    }
}

View file

@ -0,0 +1,490 @@
package com.ibm.text.UCA;
import java.util.*;
import java.io.*;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
public class GenOverlap {
// CEList -> the string most recently registered with exactly that CE sequence (kept in CEList order).
static Map completes = new TreeMap();
// String -> the CEList it was registered with.
static Map back = new HashMap();
// CEList prefix -> sorted Set of the strings whose CE sequence starts with that prefix
// (only non-empty, proper prefixes are recorded; see addString).
static Map initials = new HashMap();
// Shared scratch buffer that collation elements are read into before being copied into a CEList.
static int[] ces = new int[50];
// Collator under test; assigned at the start of test() and generateRevision().
static UCA collator;
// Character database; assigned at the start of test() and generateRevision().
static UCD ucd;
// NFD normalizer; assigned at the start of test() and generateRevision().
static Normalizer nfd;
// NFKD normalizer; assigned at the start of test() and generateRevision().
static Normalizer nfkd;
/**
 * Builds the overlap tables from the collator's contents and writes the overlap report.
 * Every string returned by the collation-contents iterator is registered via addString, as is
 * the NFKD form of every represented supplementary code point whose decomposition type is
 * at least UCD.COMPATIBILITY. Progress and counts are printed to System.out.
 *
 * @param collatorIn the collator to analyse; stored in the static {@code collator} field
 */
public static void test(UCA collatorIn) throws Exception {
    collator = collatorIn;
    CEList.main(null);

    System.out.println("# Overlap");
    System.out.println("# Generated " + new Date());

    ucd = UCD.make();
    nfd = new Normalizer(Normalizer.NFD);
    nfkd = new Normalizer(Normalizer.NFKD);
    UCA.CollationContents source = collator.getCollationContents(UCA.FIXED_CE, nfd);

    // store data for faster lookup
    System.out.println("# Gathering Data");
    int dots = 0;
    int[] ceCount = new int[1];
    String entry;
    do {
        Utility.dot(dots++);
        entry = source.next(ces, ceCount);
        if (entry != null) {
            addString(entry, new CEList(ces, 0, ceCount[0]));
        }
    } while (entry != null);

    // Supplementary code points with a compatibility-style decomposition.
    for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) {
        if (ucd.isRepresented(cp)) {
            byte decompType = ucd.getDecompositionType(cp);
            if (decompType >= UCD.COMPATIBILITY) {
                String decomp = nfkd.normalize(cp);
                CEList remapped = getCEList(cp, decomp, decompType);
                addString(decomp, remapped);
                System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + remapped);
            }
        }
    }

    Utility.fixDot();
    System.out.println("# Completes Count: " + completes.size());
    System.out.println("# Initials Count: " + initials.size());
    System.out.println("# Writing Overlaps");
    // simpleList() is an alternative, console-only report.
    fullCheck();
}
/**
 * Registers a string together with its collation elements: records the pair in both
 * {@code back} and {@code completes}, and files the string under every non-empty proper
 * prefix of its CE list in {@code initials}.
 *
 * @param s           the string being registered
 * @param currCEList  the collation elements of {@code s}
 */
public static void addString(String s, CEList currCEList) {
    back.put(s, currCEList);
    completes.put(currCEList, s);

    final int total = currCEList.length();
    int prefixLength = 1;
    while (prefixLength < total) {
        CEList prefix = currCEList.start(prefixLength);
        Set owners = (Set) initials.get(prefix);
        if (owners == null) {
            owners = new TreeSet();
            initials.put(prefix, owners);
        }
        owners.add(s);
        ++prefixLength;
    }
}
/**
 * Prints, for every registered CE list that is also a proper prefix of some other registered
 * string's CE list, the string it belongs to followed by all the longer strings it starts.
 * Ends with the number of such lists found.
 */
static void simpleList() {
    int dots = 0;
    int foundCount = 0;
    for (Iterator entries = completes.entrySet().iterator(); entries.hasNext(); ) {
        Utility.dot(dots++);
        // see if the ces for the current element are the start of something else
        Map.Entry entry = (Map.Entry) entries.next();
        CEList key = (CEList) entry.getKey();
        String val = (String) entry.getValue();
        Set longer = (Set) initials.get(key);
        if (longer == null) {
            continue;
        }
        Utility.fixDot();
        foundCount++;
        System.out.println("Possible Overlap: ");
        System.out.println(" " + ucd.getCodeAndName(val));
        System.out.println("\t" + key);
        int index = 0;
        for (Iterator matches = longer.iterator(); matches.hasNext(); ) {
            String match = (String) matches.next();
            CEList matchCEs = (CEList) back.get(match);
            System.out.println((index++) + ". " + ucd.getCodeAndName(match));
            System.out.println("\t" + matchCEs);
        }
    }
    System.out.println("# Found Count: " + foundCount);
}
// When true, matchWhole prints a depth-indented trace of each step to System.out.
static boolean PROGRESS = false;
/**
 * Runs matchWhole on every registered CE list of length two or more and writes the results:
 * an HTML table to "Overlap.html" and a plain list of the affected strings to "Overlap.txt".
 * A WARNING row is emitted whenever the two reconstructed strings do not actually produce
 * equal CE lists. Both writers are closed even if an exception interrupts the report.
 *
 * @throws IOException if either output file cannot be opened
 */
static void fullCheck() throws IOException {
    PrintWriter log = Utility.openPrintWriter("Overlap.html");
    try {
        PrintWriter plainList = Utility.openPrintWriter("Overlap.txt");
        try {
            Iterator it = completes.keySet().iterator();
            int counter = 0;
            int foundCount = 0;
            // One-element arrays act as in/out accumulators for matchWhole.
            String[] goalChars = new String[1];
            String[] matchChars = new String[1];

            log.println("<html><head>");
            log.println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
            log.println("<title>New Page 1</title>");
            log.println("<style><!--");
            log.println("table { border-style: solid; border-width: 1 }");
            log.println("td { border-style: solid; border-width: 1 }");
            log.println("--></style>");
            log.println("</head><body><table>");

            while (it.hasNext()) {
                Utility.dot(counter++);
                CEList key = (CEList) it.next();
                if (key.length() < 2) continue;
                String val = (String) completes.get(key);
                goalChars[0] = "";
                matchChars[0] = "";
                if (matchWhole(val, key, 0, goalChars, matchChars)) {
                    plainList.println(ucd.getCodeAndName(val));
                    goalChars[0] = val + goalChars[0]; // fix first char

                    CEList goalCEs = getCEList(goalChars[0]);
                    CEList matchCEs = getCEList(matchChars[0]);
                    if (!goalCEs.equals(matchCEs)) {
                        log.println("<tr><td colspan='6'>WARNING:" + matchCEs + "</td></tr>");
                    }
                    foundCount++;
                    log.println("<tr><td>" + val + "</td>");
                    log.println("<td>" + goalChars[0] + "</td>");
                    log.println("<td>" + matchChars[0] + "</td>");
                    log.println("<td>" + ucd.getCodeAndName(goalChars[0]) + "</td>");
                    log.println("<td>" + ucd.getCodeAndName(matchChars[0]) + "</td>");
                    log.println("<td>" + goalCEs + "</td></tr>");
                }
            }
            // Every row is already closed above, so only the table, body and document remain.
            log.println("</table>Number of Overlapping characters: " + foundCount + "</body></html>");
        } finally {
            plainList.close();
        }
    } finally {
        log.close();
    }
}
/**
 * Returns the collation elements the current collator produces for {@code s}.
 * The shared {@code ces} buffer is overwritten; the returned list holds its own copy.
 */
static private CEList getCEList(String s) {
    final int ceCount = collator.getCEs(s, true, ces);
    final CEList result = new CEList(ces, 0, ceCount);
    return result;
}
/**
 * Returns the collation elements of {@code s} with each tertiary weight passed through
 * CEList.remap for the given original character and decomposition type; primary and
 * secondary weights are kept. The shared {@code ces} buffer is overwritten.
 *
 * @param originalChar the code point whose decomposition {@code s} is
 * @param s            the text to collate
 * @param type         decomposition type handed to CEList.remap
 */
static private CEList getCEList(int originalChar, String s, byte type) {
    final int ceCount = collator.getCEs(s, true, ces);
    int index = 0;
    while (index < ceCount) {
        final int ce = ces[index];
        ces[index] = UCA.makeKey(
            UCA.getPrimary(ce),
            UCA.getSecondary(ce),
            CEList.remap(originalChar, type, UCA.getTertiary(ce)));
        index++;
    }
    return new CEList(ces, 0, ceCount);
}
/**
 * Recursively searches for a sequence of registered strings whose collation elements,
 * concatenated, cover exactly the CE list {@code goal}.
 *
 * <p>On success the strings found are prepended to element 0 of the accumulator arrays.
 * In condition 2 the recursive call passes the two accumulators in swapped order, because
 * the remainder to be matched then belongs to the other side of the comparison.</p>
 *
 * @param goalStr    string associated with {@code goal}; used only in the PROGRESS trace
 * @param goal       the collation elements still to be covered
 * @param depth      recursion depth; the search gives up once this exceeds 5
 * @param goalChars  one-element accumulator for the goal side
 * @param otherChars one-element accumulator for the matching side
 * @return true if a covering was found, false otherwise
 */
static boolean matchWhole(String goalStr, CEList goal, int depth, String[] goalChars, String[] otherChars) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Trying: " + ucd.getCodeAndName(goalStr) + ", " + goal);
// to stop infinite loops, we limit the depth to 5
if (depth > 5) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "stack exhausted");
return false;
}
String match;
// There are 3 possible conditions. Any of which work.
// To eliminate double matches at the top level, we test depth > 0
if (depth > 0) {
// Condition 1.
// we have an exact match
match = (String) completes.get(goal);
if (match != null) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Exactly: " + ucd.getCodeAndName(match));
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
// Condition 2
// this whole string matches some initial portion of another string
// AND the remainder of that other string also does a matchWhole.
// Example: if we get the following, we search for a match to "de"
// abc...
// abcde
// If we find a match, we append to the strings, the string for abc
// and the one for abcde
Set probe = (Set) initials.get(goal);
if (probe != null) {
Iterator it2 = probe.iterator();
while (it2.hasNext()) {
match = (String) it2.next();
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Longer: " + ucd.getCodeAndName(match)
+ "\t\tswitching");
// trail = the part of the longer string's CEs that extends beyond goal
CEList trail = ((CEList) back.get(match)).end(goal.length());
// note the swapped accumulators: the remainder now plays the role of the goal
boolean doesMatch = matchWhole(match, trail, depth+1, otherChars, goalChars);
if (doesMatch) {
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
}
}
}
// Condition 3
// the first part of this string matches a whole other string
// and the remainder of this string also does a matchWhole
// Example: if we get the following, we search for a match to "de"
// abcde..
// abc..
// if we find a match
// Longest prefix first; i never reaches 0 or goal.length(), so both pieces are non-empty.
for (int i = goal.length() - 1; i > 0; --i) {
CEList first = goal.start(i);
match = (String) completes.get(first);
if (match != null) {
if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Shorter: " + ucd.getCodeAndName(match));
boolean doesMatch = matchWhole("", goal.end(i), depth+1, goalChars, otherChars);
if (doesMatch) {
otherChars[0] = match + otherChars[0];
if (PROGRESS) System.out.println(Utility.repeat(". ", depth)
+ ucd.getCode(goalChars[0])
+ " / " + ucd.getCode(otherChars[0])
);
return true;
}
}
}
// if we get this far, we failed.
return false;
}
/**
 * Runs the two-argument generateRevision twice for the same collator:
 * first with {@code doMax} false, then with {@code doMax} true.
 *
 * @param collatorIn the collator to pass to both runs
 */
public static void generateRevision (UCA collatorIn) throws Exception {
generateRevision(collatorIn, false);
generateRevision(collatorIn, true);
}
/**
 * Walks every entry returned by the collator's collation contents, rebuilds
 * a "new" collation-element list for the entry's string one code point at a
 * time, and writes a report of the entries whose new list differs from the
 * list stored in the table ("old"). Strings that share an identical CE list
 * are reported as collisions, separately for new-only, old-only and both.
 *
 * Side effects: assigns the static fields collator, ucd, nfd and nfkd, reuses
 * the shared ces buffer, prints progress to System.out, and writes the file
 * "UCA-old-vs-new.txt" (or "UCA-old-vs-new-MAX.txt" when doMax is true).
 *
 * @param collatorIn collator whose table is examined
 * @param doMax when true, the last rewritten CE of a multi-CE compatibility
 *        decomposition gets tertiary weight 0x1F instead of the remapped one
 * @throws Exception propagated from the collator, UCD or file operations
 */
public static void generateRevision (UCA collatorIn, boolean doMax) throws Exception {
collator = collatorIn;
// NOTE(review): runs CEList's own main before generating; presumably a self-test -- confirm
CEList.main(null);
System.out.println("# Generate");
System.out.println("# Generated " + new Date());
ucd = UCD.make();
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
System.out.println("# Gathering Data");
int counter = 0;
int[] lenArray = new int[1];
// entries whose new and old CE lists differ: Pair(newList, Pair(str, oldList))
Set list = new TreeSet();
// first string seen for each CE list
Map newCollisions = new HashMap();
Map oldCollisions = new HashMap();
// later strings with an already-seen CE list: str -> Pair(first string, CE list)
Map newProblems = new TreeMap();
Map oldProblems = new TreeMap();
// stand-in (a single zero CE) used whenever a list would otherwise be empty
CEList nullCEList = new CEList(new int[1]);
while (true) {
Utility.dot(counter++);
// fills ces and lenArray[0]; a null string ends the iteration
String str = cc.next(ces, lenArray);
if (str == null) break;
int len = lenArray[0];
// CEList's public constructor copies, so oldList survives later reuse of ces
CEList oldList = new CEList(ces, 0, len);
CEList newList = new CEList(ces,0,0);
int cp;
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(str, i);
// debugging hook
if (0xFF67 <= cp && cp <= 0xFF6F) {
System.out.println("debug");
}
boolean mashLast = false;
if (nfkd.normalizationDiffers(cp)) {
String decomp = nfkd.normalize(cp);
String canon = nfd.normalize(cp);
len = collator.getCEs(decomp, true, ces);
// only compatibility decompositions (NFKD differs from NFD) get their CEs rewritten
if (!decomp.equals(canon)) {
byte type = ucd.getDecompositionType(cp);
for (int j = 0; j < len; ++j) {
// primary is forced to 0x20A for every CE when this is the first code point of str
// and its decomposition is longer than one char and starts with a space
int p = (i == 0 && decomp.length() > 1 && decomp.charAt(0) == ' ' ? 0x20A : UCA.getPrimary(ces[j]));
int s = UCA.getSecondary(ces[j]);
// a CE with a non-zero primary and a secondary other than 0x20 is split in two
boolean needsFix = (s != 0x20 && p != 0);
// len grows before t is computed, so "j == len-1" below already sees the grown length
if (needsFix) ++len;
int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
if (needsFix) {
// first half keeps the primary with secondary 0x20; the loop then advances to the inserted slot
ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra
// NOTE(review): shifts len - j elements up by one; assumes ces has spare capacity -- confirm
System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE!
p = 0;
}
// second half (or the unsplit CE): primary 0 after a split, original secondary
ces[j] = UCA.makeKey(p, s, t);
}
}
} else {
len = collator.getCEs(UTF16.valueOf(cp), true, ces);
}
CEList inc = new CEList(ces, 0, len);
// debugging output for two specific code points
if (cp == 0xFF71 || cp == 0xFF67) {
System.out.println(" String: " + ucd.getCodeAndName(cp));
System.out.println(" Type: " + ucd.getDecompositionTypeID(cp));
System.out.println(" xxx: " + inc);
}
newList = newList.append(inc);
}
if (newList.length() == 0) newList = nullCEList;
if (oldList.length() == 0) oldList = nullCEList;
if (!newList.equals(oldList)) {
/*
System.out.println("String: " + ucd.getCodeAndName(str));
System.out.println("\tOld: " + oldList);
System.out.println("\tNew: " + newList);
*/
list.add(new Pair(newList, new Pair(str, oldList)));
}
// check for collisions
if (str.equals("\u206F")) {
System.out.println("debug");
}
Object probe = newCollisions.get(newList);
if (probe == null) {
newCollisions.put(newList, str);
} else {
newProblems.put(str, new Pair((String)probe, newList));
}
probe = oldCollisions.get(oldList);
if (probe == null) {
oldCollisions.put(oldList, str);
} else {
oldProblems.put(str, new Pair((String)probe, oldList));
}
}
// split the colliding strings into new-only, old-only and those present in both
Set newKeys = new TreeSet(newProblems.keySet());
Set oldKeys = new TreeSet(oldProblems.keySet());
Set joint = new TreeSet(newKeys);
joint.retainAll(oldKeys);
newKeys.removeAll(joint);
oldKeys.removeAll(joint);
// NOTE(review): log is closed only on the normal path; an exception below leaves it open
PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"));
Iterator it = list.iterator();
int last = -1;
while (it.hasNext()) {
Utility.dot(counter++);
Pair value = (Pair) it.next();
CEList newList = (CEList)value.first;
// a blank line separates groups whose first CE has a different primary
int cur = UCA.getPrimary(newList.at(0));
if (cur != last) {
log.println();
last = cur;
}
Pair v2 = (Pair) value.second;
String ss = (String)v2.first;
// NOTE(review): the decomposition type is looked up for the first UTF-16 code unit only
log.println(ucd.getCodeAndName(ss) + "\t\t" + ucd.getDecompositionTypeID(ss.charAt(0)));
log.println("\tnew:\t" + value.first);
log.println("\told:\t" + v2.second);
}
/*
log.println();
log.println("New Collisions: " + newKeys.size());
it = newKeys.iterator();
while (it.hasNext()) {
String key = (String) it.next();
CEList cel = (CEList) newProblems.get(key);
String other = (String) newCollisions.get(cel);
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
log.println("\t" + cel);
}
log.println("Removed Collisions: " + oldKeys.size());
it = oldKeys.iterator();
while (it.hasNext()) {
String key = (String) it.next();
CEList cel = (CEList) oldProblems.get(key);
String other = (String) oldCollisions.get(cel);
log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other));
log.println("\t" + cel);
}
*/
showCollisions(log, "New Collisions:", newKeys, newProblems);
showCollisions(log, "Old Collisions:", oldKeys, oldProblems);
// entries in both sets are reported with their old CE lists
showCollisions(log, "In Both:", joint, oldProblems);
log.close();
}
/**
 * Writes one section of the collision report.
 *
 * The section starts with a blank line and a heading made of {@code title}
 * immediately followed by the number of keys. Each key is looked up in
 * {@code probs}, whose values are {@code Pair(otherString, CEList)}; the
 * resulting rows are sorted by CE list, and a blank line is written whenever
 * the primary weight of the first CE changes.
 *
 * @param log destination; flushed but not closed
 * @param title heading text (the count is appended without a separator)
 * @param bad set of String keys to report
 * @param probs map from each key to its Pair(other string, CE list)
 */
static void showCollisions(PrintWriter log, String title, Set bad, Map probs) {
    log.println();
    log.println(title + bad.size());

    // Rows are Pair(CE list, "key,\tother"), so the TreeSet orders them by CE list.
    Set rows = new TreeSet();
    for (Iterator keys = bad.iterator(); keys.hasNext();) {
        String key = (String) keys.next();
        Pair problem = (Pair) probs.get(key);
        String other = (String) problem.first;
        CEList ceList = (CEList) problem.second;
        if (key.equals("\u0001")) {
            System.out.println("debug"); // debugging hook
        }
        String description = ucd.getCodeAndName(key) + ",\t" + ucd.getCodeAndName(other);
        rows.add(new Pair(ceList, description));
    }

    int previousPrimary = -1;
    for (Iterator rowIterator = rows.iterator(); rowIterator.hasNext();) {
        Pair row = (Pair) rowIterator.next();
        CEList ceList = (CEList) row.first;
        int primary = UCA.getPrimary(ceList.at(0));
        if (primary != previousPrimary) {
            previousPrimary = primary;
            log.println();
        }
        log.println("Collision between: " + row.second);
        log.println("\t" + row.first);
    }
    log.flush();
}
}

View file

@ -0,0 +1,54 @@
package com.ibm.text.UCA;
import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
/**
 * Orders strings by comparing their UTF-16 code units from left to right.
 *
 * The first position at which the two strings differ decides the result; if
 * one string is a prefix of the other, the shorter one sorts first. The result
 * is always exactly -1, 0 or 1.
 *
 * An earlier level-aware variant (which treated U+0000 as a level separator)
 * was disabled in the original source; this class implements only the plain
 * comparison that was actually in effect.
 */
public final class RuleComparator implements java.util.Comparator {

    /**
     * Compares two strings code unit by code unit.
     *
     * @param s first string (must be a String)
     * @param t second string (must be a String)
     * @return -1 if s sorts before t, 1 if it sorts after, 0 if they are equal
     * @throws ClassCastException if either argument is not a String
     * @throws NullPointerException if either argument is null
     */
    public int compare(Object s, Object t) {
        String ss = (String) s;
        String tt = (String) t;

        // Both indices are bounded by the shorter length, so charAt cannot fail.
        int shared = Math.min(ss.length(), tt.length());
        for (int i = 0; i < shared; ++i) {
            char cs = ss.charAt(i);
            char ct = tt.charAt(i);
            if (cs != ct) return cs < ct ? -1 : 1;
        }

        // One string is a prefix of the other (or they are equal): shorter first.
        if (ss.length() > tt.length()) return 1;
        if (ss.length() < tt.length()) return -1;
        return 0;
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,538 @@
package com.ibm.text.UCD;
import java.io.IOException;
//import com.ibm.text.unicode.UInfo;
import java.util.*;
import java.io.*;
//import java.text.*;
import com.ibm.text.utility.*;
public class BuildNames implements UCD_Types {
// When true, lookup, putString and addString print a trace of every token they handle.
static final boolean DEBUG = true;
// Character database used to fetch names; assigned in main before collectWords runs.
static UCD ucd;
/**
 * Entry point: creates the character database via UCD.make() and then runs
 * collectWords(). The command-line arguments are not used.
 */
public static void main(String[] args) throws IOException {
// ucd must be set first: collectWords reads names through it
ucd = UCD.make();
collectWords();
}
// Distinct words collected by stash(), ordered by LengthFirstComparator
// (presumably shorter strings first -- the comparator is defined elsewhere; confirm).
static Set words = new TreeSet(new LengthFirstComparator());
// Distinct transformed names collected by collectWords(), same ordering.
static Set lines = new TreeSet(new LengthFirstComparator());
// Occurrence count per character, indexed by char value; only values below 128 fit.
static int[] letters = new int[128];
/**
 * Records a word in {@code words} and counts each of its characters in
 * {@code letters}.
 *
 * @param word word to record; every character must be below 128, otherwise
 *        the counter array index is out of range
 */
static void stash(String word) {
    words.add(word);
    char[] chars = word.toCharArray();
    for (int k = 0; k < chars.length; ++k) {
        letters[chars[k]]++;
    }
}
/**
 * Rewrites a character name into the restricted alphabet used for compaction.
 *
 * <ul>
 * <li>'-', '&lt;' and '&gt;' become separate words: a space is inserted before
 *     the character unless the output is empty or already ends in a space, and
 *     after it unless the input ends there or continues with a space.</li>
 * <li>Lower-case ASCII letters are upper-cased.</li>
 * <li>An ASCII digit d becomes '*' followed by the letter 'A' + d.</li>
 * <li>Everything else is copied unchanged.</li>
 * </ul>
 *
 * @param line name to rewrite
 * @return the original string object if nothing needed rewriting, otherwise
 *         the rewritten text with leading and trailing whitespace trimmed
 */
static String transform(String line) {
    final int length = line.length();
    StringBuffer out = new StringBuffer();
    boolean rewritten = false;

    for (int pos = 0; pos < length; ++pos) {
        final char ch = line.charAt(pos);
        switch (ch) {
        case '-':
        case '<':
        case '>': {
            int written = out.length();
            if (written > 0 && out.charAt(written - 1) != ' ') {
                out.append(' ');
            }
            out.append(ch);
            int next = pos + 1;
            if (next < length && line.charAt(next) != ' ') {
                out.append(' ');
            }
            rewritten = true;
            break;
        }
        default:
            if (ch >= 'a' && ch <= 'z') {
                out.append((char)(ch - 'a' + 'A'));
                rewritten = true;
            } else if (ch >= '0' && ch <= '9') {
                out.append('*');
                out.append((char)(ch - '0' + 'A'));
                rewritten = true;
            } else {
                out.append(ch);
            }
            break;
        }
    }

    return rewritten ? out.toString().trim() : line;
}
/**
 * Gathers the transformed names of all code points that do not have a
 * computable name, splits them into words, and feeds first the words and then
 * the whole lines to CompactName, reporting every string that does not
 * survive a round trip through its token.
 *
 * Output goes to System.out. The per-string report line is printed only for
 * strings that fail the round trip, and the running index in that line is
 * advanced only when a line is printed.
 *
 * @throws IOException declared for callers; nothing in this method throws it directly
 */
static void collectWords() throws IOException {
    System.out.println("Gathering data");
    String[] parts = new String[100];
    int used = 0;
    int sum = 0;
    // NOTE(review): the bound excludes U+10FFFF itself; kept as in the original
    for (int cp = 0; cp < 0x10FFFF; ++cp) {
        if (ucd.hasComputableName(cp)) continue;
        String name = transform(ucd.getName(cp));
        sum += name.length();
        used++;

        // replace numbers & letters
        int len = Utility.split(name, ' ', parts);
        for (int j = 0; j < len; ++j) {
            stash(parts[j]);
        }
        lines.add(name);
    }

    // Guard the percentage: with no names at all the division would fail.
    int overhead = lastLink - used;
    int overheadPercent = used == 0 ? 0 : overhead * 100 / used;
    System.out.println("Overhead: " + overhead + ", " + overheadPercent + "%");
    System.out.println("Strings: " + sum + ", " + (lastLink*4));

    System.out.println();
    System.out.println("Compacting Words");
    System.out.println();
    Iterator it = words.iterator();
    int i = 0;
    while (it.hasNext()) {
        String s = (String) it.next();
        int test = CompactName.addWord(s);
        String round = CompactName.stringFromToken(test);
        boolean goesRound = round.equals(s);
        // Fixed: the detail used to be attached when goesRound was true, so it
        // could never appear on this line, which prints only on failure.
        if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
            + (!goesRound ? ": NO RT: '" + round + "'" : ""));
    }

    System.out.println();
    System.out.println("Compacting Lines");
    System.out.println();
    // From here on, new tokens are decoded with a space between their halves.
    CompactName.startLines();
    it = lines.iterator();
    i = 0;
    while (it.hasNext()) {
        String s = (String) it.next();
        if (s.equals("< BELL >")) {
            System.out.println("DEBUG"); // debugging hook
        }
        int test = CompactName.addLine(s);
        String round = CompactName.stringFromToken(test);
        boolean goesRound = round.equals(s);
        if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
            + (!goesRound ? ": NO RT: '" + round + "'" : ""));
    }

    System.out.println("Strings: " + sum
        + ", " + (CompactName.spacedMinimum*4)
        + ", " + (CompactName.lastToken*4)
    );
}
/*
Set stuff = new TreeSet();
for (int i = 0; i < letters.length; ++i) {
if (letters[i] != 0) {
stuff.add(new Integer((letters[i] << 8) + i));
}
}
it = stuff.iterator();
while (it.hasNext()) {
int in = ((Integer) it.next()).intValue();
System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8));
}
int r = addString(name);
if (!DEBUG && !rname.equals(name)) {
System.out.println("\tNo Round Trip: '" + rname + "'");
}
*/
// Strings already stored in the link table, mapped to their Integer link index.
static Map stringToInt = new HashMap();
static Map intToString = new HashMap();

// Character -> small code (0 means "no code"); unmap is the inverse table.
static final int[] remap = new int['Z'+1];
// One more than the largest code handed out.
static final int maxToken;
static {
    // Codes are handed out consecutively from 1 in this fixed order:
    // space, '-', '>', '<', then A-Z, then 0-9.
    final String punctuation = " -><";
    int next = 1;
    for (int k = 0; k < punctuation.length(); ++k) {
        remap[punctuation.charAt(k)] = next++;
    }
    for (char letter = 'A'; letter <= 'Z'; ++letter) {
        remap[letter] = next++;
    }
    for (char digit = '0'; digit <= '9'; ++digit) {
        remap[digit] = next++;
    }
    maxToken = next;
}

// Code -> one-character string; code 0 maps to the empty string.
static final String[] unmap = new String[maxToken];
static {
    unmap[0] = "";
    for (int ch = 0; ch < remap.length; ++ch) {
        int code = remap[ch];
        if (code != 0) {
            unmap[code] = String.valueOf((char) ch);
        }
    }
}
// Link table: each entry packs a lead token in the upper 16 bits and a trail token in the lower 16.
static int[] links = new int[40000];
static final int linkStart = 0;
// Index of the next free slot in links.
static int lastLink = 0;
// Tokens whose low 15 bits exceed this bound encode one or two characters directly.
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
/**
 * Tells whether a token encodes its characters directly instead of pointing
 * into the link table. Bit 15 (the trailing-space flag) is ignored. Note that
 * -1, the "not found" value of getInt, also counts as literal here because
 * all of its low 15 bits are set.
 */
static boolean isLiteral(int i) {
    int low15 = i & 0x7FFF;
    return low15 > LITERAL_BOUND;
}
/**
 * Expands a token back into its string.
 *
 * Bit 15 of the token means "append a space to the result". Of the remaining
 * value, anything above LITERAL_BOUND is a directly encoded pair of character
 * codes; anything else is an index into links whose lead and trail halves are
 * expanded recursively.
 *
 * @param i token to expand
 * @return the expanded string
 */
static String lookup(int i) {
    final boolean trailingSpace = (i & 0x8000) != 0;
    if (trailingSpace) {
        i &= ~0x8000;
    }

    String result;
    if (i <= LITERAL_BOUND) {
        int packed = links[i];
        String leadText = lookup(packed >>> 16);
        String trailText = lookup(packed & 0xFFFF);
        result = leadText + trailText;
    } else {
        // the trace line below reports this reduced value, as before
        i -= LITERAL_BOUND;
        result = unmap[i / maxToken] + unmap[i % maxToken];
    }

    if (trailingSpace) {
        result += ' ';
    }
    if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
    return result;
}
/**
 * Returns the token for a string without creating anything.
 *
 * @param s string to look up
 * @return 0 for the empty string; a directly encoded literal token for one-
 *         and two-character strings; the stored link index for longer strings
 *         that were registered through putString; otherwise -1
 */
static int getInt(String s) {
    final int length = s.length();
    if (length == 0) return 0;

    if (length < 3) {
        int firstCode = remap[s.charAt(0)];
        // a missing second character uses the code stored at index 0
        int secondCode = remap[length > 1 ? s.charAt(1) : 0];
        return LITERAL_BOUND + (firstCode * maxToken + secondCode);
    }

    Integer known = (Integer) stringToInt.get(s);
    return known == null ? -1 : known.intValue();
}
/**
 * Registers a new string as the concatenation of two existing tokens.
 *
 * @param s string being registered; must not be registered already
 * @param lead token for the first part (may carry the 0x8000 space flag)
 * @param trail token for the second part; only its low 16 bits are stored
 * @return the index of the new link entry
 * @throws IllegalArgumentException if s is already registered
 */
static int putString(String s, int lead, int trail) {
    if (stringToInt.get(s) != null) throw new IllegalArgumentException();

    final int index = lastLink++;
    links[index] = (lead << 16) + (trail & 0xFFFF);

    if (DEBUG) {
        System.out.println("'" + s + "', link[" + index + "] = lead: " + lead + ", trail: " + trail);
        // verify immediately that the new entry expands back to s
        String roundTrip = lookup(index);
        if (!roundTrip.equals(s)) {
            System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
        }
    }

    stringToInt.put(s, new Integer(index));
    return index;
}
// Returns the token for s, creating link entries (recursively) when s is not known yet.
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
//
// Strategy, in order:
//   1. s is already known or short enough to be a literal: return that token.
//   2. Some split has both halves known: store the pair and return.
//   3. Some split has one half known as a link entry: recurse on the other half.
//   4. Otherwise split after the last space, or near the middle, and recurse on both halves.
// When a split falls right after a space, the space is dropped from the first
// half and recorded as the 0x8000 flag on the lead token instead.
static int addString(String s) {
int result = getInt(s);
if (result != -1) return result;
int limit = s.length() - 1;
int bestLen = 0;
int best_i = 0;
int bestSpaceLen = 0;
int bestSpace_i = 0;
int lastSpace = -1;
int spaceBits;
int endOfFirst;
// invariant. We break after a space if there is one.
for (int i = 1; i < limit; ++i) {
// c is the last character of the first half for a split at i
char c = s.charAt(i-1);
spaceBits = 0;
endOfFirst = i;
if (c == ' ') {
lastSpace = i;
endOfFirst--;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(i);
// debugging hook
if (firstPart.equals("<START OF ")) {
System.out.println("HUH");
}
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH BOTH");
return putString(s, spaceBits | lead, trail);
}
// isLiteral(-1) is true, so these tests pass only for halves found as link entries
if (!isLiteral(lead)) {
if (i > bestLen) {
bestLen = i;
best_i = i;
}
// NOTE(review): records i + 1 although this split is at i -- confirm intended
if (i > bestSpaceLen && c == ' ') {
bestSpaceLen = i;
bestSpace_i = i + 1;
}
}
int end_i = s.length() - i;
if (!isLiteral(trail)) {
if (end_i > bestLen) {
bestLen = end_i;
best_i = i;
}
if (end_i > bestSpaceLen && c == ' ') {
bestSpaceLen = end_i;
bestSpace_i = i + 1;
}
}
}
// if the string contains a usable space, only splits after a space are considered
if (lastSpace >= 0) {
bestLen = bestSpaceLen;
best_i = bestSpace_i;
}
spaceBits = 0;
if (bestLen > 0) { // if one matches, recurse -- and return pair
endOfFirst = best_i;
if (lastSpace > 0) {
--endOfFirst;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0) {
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH FIRST");
return putString(s, spaceBits | lead, addString(lastPart));
} else {
// NOTE(review): trail is used as-is here; nothing checks that it is >= 0
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH SECOND");
return putString(s, spaceBits | addString(firstPart), trail);
}
}
// otherwise, we failed to find anything. Then break before the last word, if there is one
// otherwise break in the middle (but at even value)
if (lastSpace >= 0) {
best_i = lastSpace;
endOfFirst = lastSpace - 1;
spaceBits = 0x8000;
} else {
endOfFirst = best_i = ((s.length() + 1) / 4) * 2;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' FALLBACK");
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
}
/*
static int addCompression(String s) {
Object in = stringToInt.get(s);
if (in != null) return ((Integer) in).intValue();
// find best match, recursively
int bestBreak = -1;
boolean pickFirst = false;
for (int i = 1; i < s.length() - 1; ++i) {
char c = s.charAt(i);
if (c == ' ' || c == '-') {
Object pos1 = stringToInt.get(s.substring(0,i+1));
//Object pos23 = stringToInt.get(s..substring(i));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
}
}
}
static void gatherData() throws IOException {
System.out.println("Gathering data");
Counter counter = new Counter();
String[] parts = new String[100];
String[] parts2 = new String[100];
int total = 0;
for (int i = 0; i < 0x10FFFF; ++i) {
//if ((i & 0xFF) == 0) System.out.println(Utility.hex(i));
if (!ucd.isRepresented(i)) continue;
String s = ucd.getName(i);
total += s.length();
int len = Utility.split(s, ' ', parts);
for (int j = 0; j < len; ++j) {
if (parts[j].indexOf('-') >= 0) {
// hyphen stuff
int len2 = Utility.split(parts[j], '-', parts2);
for (int k = 0; k < len2; ++k) {
if (k == len2 - 1) {
counter.add(parts2[k] + '-');
} else {
counter.add(parts2[k] + " ");
}
}
} else {
// normal
counter.add(parts[j] + " ");
}
}
}
System.out.println("Sorting data");
Map m = counter.extract();
System.out.println("Printing data");
PrintWriter log = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
32*1024));
log.println("total: " + total);
Iterator it = m.keySet().iterator();
String mondo = "";
int i = 0;
int strTotal = 0;
int index = 0;
Map fullToCompressed = new HashMap();
String mondoIndex = "";
main:
while (it.hasNext()) {
index++;
if ((i & 255) == 0) System.out.println("#" + i);
Counter.RWInteger key = (Counter.RWInteger) it.next();
String value = (String)m.get(key);
log.println(i++ + ": " + key + ": \"" + value + "\"");
strTotal += value.length();
// first 128 are the highest frequency, inc. space
if (index < 128 - SINGLES) {
mondo += value;
fullToCompressed.put(value, new String((char)(index + reserved)));
continue;
}
int pos = mondo.indexOf(value);
if (pos >= 0) {
// try splitting!
int bestBreak = -1;
boolean pickFirst = false;
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
int pos2 = mondo.indexOf(value.substring(0,k) + " ");
int pos3 = mondo.indexOf(value.substring(k));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
if (bestBreak > 0) {
if (pickFirst) {
mondo += value.substring(bestBreak);
} else {
mondo += value.substring(0, bestBreak) + " ";
}
} else {
mondo += value;
}
}
// high bit on, means 2 bytes, look in array
}
log.println("strTotal: " + strTotal);
log.println("mondo: " + mondo.length());
int k = 80;
for (; k < mondo.length(); k += 80) {
log.println(mondo.substring(k-80, k));
}
log.println(mondo.substring(k-80)); // last line
log.close();
}
static int indexOf(StringBuffer target, String source) {
int targetLen = target.length() - source.length();
main:
for (int i = 0; i <= targetLen; ++i) {
for (int j = 0; j < source.length(); ++j) {
if (target.charAt(i) != source.charAt(j)) continue main;
}
return i;
}
return -1;
}
static final int SINGLES = 26 + 10 + 2;
*/
/*
static String decode(int x) {
if (x < SINGLES) {
if (x < 26) return String.valueOf(x + 'A');
if (x < 36) return String.valueOf(x - 26 + '0');
if (x == 36) return "-";
return " ";
}
if (x < binaryLimit) {
x =
*/
}

View file

@ -0,0 +1,260 @@
package com.ibm.text.UCD;
import java.io.IOException;
import java.util.*;
import java.io.*;
import java.text.*;
/**
 * Compresses name words and lines into integer tokens.
 *
 * A token is one of two kinds:
 * <ul>
 * <li><b>literal</b> (bit 15 set): up to three characters packed as three
 *     5-bit codes taken from {@link #compactMap};</li>
 * <li><b>composite</b> (bit 15 clear): an index into {@link #tokenList},
 *     whose entry packs a lead token in the upper 16 bits and a trail token
 *     in the lower 16 bits.</li>
 * </ul>
 * Composite tokens created after {@link #startLines()} are decoded with a
 * single space between their lead and trail halves.
 *
 * All state is static and unsynchronized.
 */
public class CompactName {
    static final boolean DEBUG = false;

    /** Small demonstration: round-trips one literal and one longer word. */
    public static void main(String[] args) throws IOException {
        int test = tokenFromString("ABZ");
        String ss = stringFromToken(test);
        System.out.println(ss);

        CompactName.addWord("ABSOLUTEISM");
        for (int i = 0; i < CompactName.lastToken; ++i) {
            String s = CompactName.stringFromToken(i);
            System.out.println(s);
        }
    }

    /** Character -> 5-bit code; 0 means "no character". Only chars below 128 are supported. */
    static final char[] compactMap = new char[128];
    /** Inverse of compactMap: 5-bit code -> character. */
    static final char[] compactUnmap = new char[128];

    static {
        // Codes are handed out consecutively: 0 = none, A-Z, then '-', '>', '<', '*'.
        char counter = 0;
        compactMap[0] = counter++;
        for (int i = 'A'; i <= 'Z'; ++i) {
            compactMap[i] = counter++;
        }
        compactMap['-'] = counter++;
        compactMap['>'] = counter++;
        compactMap['<'] = counter++;
        compactMap['*'] = counter++;

        compactUnmap[0] = 0;
        for (char i = 0; i < compactUnmap.length; ++i) {
            int x = compactMap[i];
            if (x != 0) compactUnmap[x] = i;
        }
    }

    /** Registered strings mapped to their Integer composite token. */
    static Map string_token = new HashMap();
    static Map token_string = new HashMap();

    /** Composite token table; entry = (lead << 16) + (trail & 0xFFFF). */
    static int[] tokenList = new int[40000];
    static final int tokenStart = 0;
    /** Number of composite tokens created so far (index of the next free slot). */
    static int lastToken = 0;
    /** Composite tokens at or above this index decode with a space between their halves. */
    static int spacedMinimum = Integer.MAX_VALUE;

    /**
     * Tells whether bit 15 of the token is set. This is true for packed
     * literals and also for -1, the "not found" value of tokenFromString.
     */
    static boolean isLiteral(int i) {
        return (i & 0x8000) != 0;
    }

    /**
     * Registers s as the composite of two existing tokens.
     *
     * @param s string being registered; must not be registered already
     * @param lead token of the first part
     * @param trail token of the second part; only its low 16 bits are stored
     * @return the new composite token
     * @throws IllegalArgumentException if s is already registered
     */
    static int addTokenForString(String s, int lead, int trail) {
        Object in = string_token.get(s);
        if (in != null) throw new IllegalArgumentException();

        int value = (lead << 16) + (trail & 0xFFFF);
        int result = lastToken;
        tokenList[lastToken++] = value;

        if (DEBUG) {
            System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
            String roundTrip = stringFromToken(result);
            if (!roundTrip.equals(s)) {
                System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
            }
        }
        string_token.put(s, new Integer(result));
        return result;
    }

    /**
     * Expands a token back into its string.
     *
     * @param i literal or composite token
     * @return the decoded string
     * @throws IllegalArgumentException if i is negative, or is a composite
     *         index that has not been assigned yet
     */
    static String stringFromToken(int i) {
        // Negative values (notably -1, "not found") have bit 15 set and used to
        // be decoded as if they were literals.
        if (i < 0) {
            throw new IllegalArgumentException("bad token: " + i);
        }

        String result;
        if ((i & 0x8000) != 0) {
            char first = compactUnmap[(i >> 10) & 0x1F];
            char second = compactUnmap[(i >> 5) & 0x1F];
            char third = compactUnmap[i & 0x1F];
            result = String.valueOf(first);
            if (second != 0) result += String.valueOf(second);
            if (third != 0) result += String.valueOf(third);
        } else if (i >= lastToken) {
            // Was "i > lastToken": index lastToken is the next free slot, not a
            // valid token, and expanding it could recurse without end.
            throw new IllegalArgumentException("bad token: " + i);
        } else {
            int value = tokenList[i];
            int lead = value >>> 16;
            int trail = value & 0xFFFF;
            if (i >= spacedMinimum) result = stringFromToken(lead) + ' ' + stringFromToken(trail);
            else result = stringFromToken(lead) + stringFromToken(trail);
        }
        if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
        return result;
    }

    /**
     * Returns the token for a string without creating anything.
     *
     * @param s non-empty string whose characters (for lengths up to 3) are below 128
     * @return a packed literal for strings of up to three characters, the
     *         registered composite token for longer strings, or -1 if a
     *         longer string has not been registered
     */
    static int tokenFromString(String s) {
        if (s.length() <= 3) {
            int first = compactMap[s.charAt(0)];
            int second = compactMap[s.length() > 1 ? s.charAt(1) : 0];
            int third = compactMap[s.length() > 2 ? s.charAt(2) : 0];
            return 0x8000 + (first << 10) + (second << 5) + third;
        }
        Object in = string_token.get(s);
        if (in == null) return -1;
        return ((Integer)in).intValue();
    }

    /**
     * Returns the token for a single word, creating composite tokens as needed.
     *
     * Splits are tried from the right. A split whose halves are both known is
     * used at once. Otherwise the split with the longest half that is already
     * a composite token is used and the other half is added recursively. If
     * no such split exists, the word is cut at a multiple of three characters
     * and both halves are added recursively.
     *
     * @param s word to add
     * @return token for s
     */
    static int addWord(String s) {
        int result = tokenFromString(s);
        if (result != -1) return result;

        int bestLen = 0;
        int best_i = 0;
        int limit = s.length() - 1;

        for (int i = limit; i >= 1; --i) {
            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            // Only halves that are already composite tokens are worth reusing;
            // literals and "not found" (-1) both have bit 15 set and are skipped.
            if (lead >= 0 && !isLiteral(lead)) {
                if (i > bestLen) {
                    bestLen = i;
                    best_i = i;
                }
            }
            if (trail >= 0 && !isLiteral(trail)) {
                int end_i = s.length() - i;
                if (end_i > bestLen) {
                    bestLen = end_i;
                    best_i = i;
                }
            }
        }

        if (bestLen > 0) { // if one matches, recurse -- and return pair
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (lead >= 0) {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH FIRST");
                return addTokenForString(s, lead, addWord(lastPart));
            } else {
                if (DEBUG) show(s, firstPart, lastPart, "MATCH SECOND");
                return addTokenForString(s, addWord(firstPart), trail);
            }
        }

        // break at multiple of 3
        best_i = ((s.length() + 1) / 6) * 3;
        String firstPart = s.substring(0, best_i);
        String lastPart = s.substring(best_i);
        if (DEBUG) show(s, firstPart, lastPart, "Fallback");
        return addTokenForString(s, addWord(firstPart), addWord(lastPart));
    }

    /** Prints one trace line describing how s was split. */
    static void show(String s, String firstPart, String lastPart, String comment) {
        System.out.println((s) + " => '" + (firstPart)
            + "' # '" + (lastPart) + "' " + comment);
    }

    /**
     * Marks the boundary between word tokens and line tokens: every composite
     * token created from now on is decoded with a space between its halves.
     */
    static void startLines() {
        spacedMinimum = lastToken;
    }

    /**
     * Returns the token for a space-separated line, creating composite tokens
     * as needed. Lines are split only at spaces, and the space itself is not
     * stored. Every word longer than three characters must already have been
     * registered through {@link #addWord}.
     *
     * @param s line to add
     * @return token for s
     * @throws IllegalArgumentException if some part of the line cannot be
     *         resolved to a token (for example an unregistered long word)
     */
    static int addLine(String s) {
        int result = tokenFromString(s);
        if (result != -1) return result;

        int bestLen = 0;
        int best_i = 0;
        int limit = s.length() - 2;

        for (int i = limit; i >= 1; --i) {
            char c = s.charAt(i);
            if (c != ' ') continue;

            String firstPart = s.substring(0, i);
            String lastPart = s.substring(i+1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);

            if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
                if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
                return addTokenForString(s, lead, trail);
            }
            // Unlike addWord, the split with the longest half wins whether or
            // not that half is known yet.
            if (i > bestLen) {
                bestLen = i;
                best_i = i;
            }
            int end_i = s.length() - i - 1;
            if (end_i > bestLen) {
                bestLen = end_i;
                best_i = i;
            }
        }

        if (bestLen > 0) {
            String firstPart = s.substring(0, best_i);
            String lastPart = s.substring(best_i + 1);
            int lead = tokenFromString(firstPart);
            int trail = tokenFromString(lastPart);
            if (DEBUG) show(s, firstPart, lastPart, lead >= 0 ? "MATCH FIRST" : "MATCH SECOND");

            // Resolve whichever halves are still unknown. Previously an unknown
            // trail (-1) was stored as-is when the lead was unknown too, giving
            // a token that decoded to garbage.
            int leadToken = lead >= 0 ? lead : addLine(firstPart);
            int trailToken = trail >= 0 ? trail : addLine(lastPart);
            return addTokenForString(s, leadToken, trailToken);
        }

        System.out.println("SHOULD HAVE MATCHED!!");
        throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
    }
}

View file

@ -0,0 +1,831 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.*;
import java.text.NumberFormat;
import java.io.*;
/** Simple program to merge UCD files into XML. Not yet documented!!
* @author Mark Davis
*/
public final class ConvertUCD implements UCD_Types {
// When true, readBlocks prints a progress line every 500 input lines.
public static final boolean SHOW = true;
public static final boolean DEBUG = false;
// Components of the version currently being converted; main parses them from "major.minor.update".
public static int major;
public static int minor;
public static int update;
// Full version string currently being converted; set by main from each argument.
static String version;
// varies by version
/*
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR20 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR21 = DATA_DIR + "\\Versions\\";
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
*/
//public static final String blocksnamePlain = "Blocks.txt";
//public static final String blocksname31 = "Blocks-4d2.beta";
/** One row per input file. The first item is the file name, the rest are
* field names in file order (skipping the character field).
* "OMIT" is special -- means don't record.
* A leading "*" on a field name marks the field as hexadecimal; the static
* initializer strips the "*" in place and records the name in isHex.
*/
static String[][] labelList = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
{"PropList", "binary"},
//{"ExtraProperties", "xp"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions", "ce"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts", "sn"},
//{"Jamo", "jn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
//*/
};
/*
static String[][] labelList31 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"PropList-3.1.0d5.beta", "binary"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-3.1.0d4.beta", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
};
/*
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-1d4", "sn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/*
{"Jamo", "jn"},
//
//"NamesList-3.1.0d1.beta"
static String[][] labelList30 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"CompositionExclusions", "ce"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
{"CaseFolding", "OMIT", "*fc"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
/*
{"Jamo", "jn"},
{"PropList.alpha", "RANGE", "OMIT"},
//
};
static String[][] labelList11 = {
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList20 = {
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList21 = {
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
*/
// handles
// Base name of the blocks file opened by readBlocks.
public static final String blocksname = "Blocks";
//public static final String[][] labelList;
// NOTE(review): not referenced in the visible part of this file; presumably selects the newer property handling -- confirm
public static final boolean NEWPROPS = true;
/*
static {
switch (major*10 + minor) {
case 31:
blocksname = blocksname31;
labelList = labelList31;
break;
case 30:
blocksname = blocksnamePlain;
labelList = labelList30;
break;
case 21:
blocksname = blocksnamePlain;
labelList = labelList21;
break;
case 20:
blocksname = blocksnamePlain;
labelList = labelList20;
break;
default:
blocksname = blocksnamePlain;
labelList = labelList11;
break;
}
}
*/
static final String dataFilePrefix = "UCD_Data";
// MAIN!!
/**
 * Converts the data for every version named on the command line.
 *
 * Opens the UTF-8 log file "UCD-log.txt" under GEN_DIR and writes a BOM to
 * it. Each argument is then taken as a version string of the form
 * "major.minor.update" (an empty argument selects UCD.latestVersion), parsed
 * into the static version fields, and converted by toJava(). The log is
 * closed once all arguments are processed or a conversion fails.
 *
 * @param args version strings, processed in order
 * @throws Exception on I/O failure, on a malformed version string, or
 *         whatever toJava() throws
 */
public static void main (String[] args) throws Exception {
    System.out.println("ConvertUCD");

    FileOutputStream logFile = new FileOutputStream(GEN_DIR + "UCD-log.txt");
    OutputStreamWriter utf8Writer = new OutputStreamWriter(logFile, "UTF8");
    log = new PrintWriter(new BufferedWriter(utf8Writer, 32*1024));
    log.write("\uFEFF"); // BOM

    try {
        for (int argIndex = 0; argIndex < args.length; ++argIndex) {
            version = args[argIndex];
            if (version.length() == 0) {
                version = UCD.latestVersion;
            }

            // a fresh array per argument, so a short version string cannot
            // pick up a component left over from the previous one
            String[] components = new String[3];
            Utility.split(version, '.', components);
            major = Integer.parseInt(components[0]);
            minor = Integer.parseInt(components[1]);
            update = Integer.parseInt(components[2]);

            toJava();
        }
    } finally {
        log.close();
    }
}
/*
static void toXML() throws Exception {
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if (false) readBlocks();
if (true) for (int i = 0; i < labelList.length; ++i) {
readSemi(labelList[i]);
} else {
readSemi(labelList[0]); // TESTING ONLY
}
writeXML();
}
*/
/**
 * Reads every file listed in labelList, compacts each collected character
 * record, prints two spot checks (U+2A6D6 and U+FFFF) and writes the Java
 * data file.
 *
 * Blocks and Unihan are special cases; reading Blocks is currently switched
 * off.
 *
 * @throws Exception whatever the readers or the writer throw
 */
static void toJava() throws Exception {
    if (false) readBlocks();

    // flip to false to read only the first file while testing
    final boolean readAllFiles = true;
    if (readAllFiles) {
        for (int fileIndex = 0; fileIndex < labelList.length; ++fileIndex) {
            readSemi(labelList[fileIndex]);
        }
    } else {
        readSemi(labelList[0]); // TESTING ONLY
    }

    for (Iterator keys = charData.keySet().iterator(); keys.hasNext();) {
        UData record = (UData) charData.get(keys.next());
        record.compact();
    }

    UData spot = getEntry(0x2A6D6);
    System.out.println("SPOT-CHECK: 2A6D6: " + spot);
    spot = getEntry(0xFFFF);
    System.out.println("SPOT-CHECK: FFFF: " + spot);

    writeJavaData();
}
// Log writer; opened and closed by main.
static PrintWriter log;
//static String directory = BASE_DIR;
//static Map appendDuplicates = new HashMap();
/** First item in labels is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
static HashMap isHex = new HashMap();
static HashMap defaults = new HashMap();
// Cleans the field labels in labelList (in place) and fills isHex / defaults.
// Index 0 of each label array is the file name and is left alone. A leading
// '*' on a field label marks a hex-valued field: the marker is stripped and
// the clean label is recorded in isHex. Handling of the "$" (append
// duplicates) and "-" (default value) markers is disabled, so every label
// gets a null default.
static {
    for (int fileIndex = 0; fileIndex < labelList.length; ++fileIndex) {
        String[] fileLabels = labelList[fileIndex];
        for (int field = 1; field < fileLabels.length; ++field) {
            String label = fileLabels[field];
            boolean hexField = label.charAt(0) == '*';
            if (hexField) {
                // strip the marker so later lookups see the clean label
                label = label.substring(1);
                fileLabels[field] = label;
                isHex.put(label, "");
            }
            defaults.put(label, null);
        }
    }
}
// One String[3] per block: {start, end, name}; start and end are the result of Utility.fromHex.
static List blockData = new LinkedList();

/**
 * Reads the blocks file for the current version and appends one entry per
 * data line to blockData. Text after '#' is ignored, blank lines are skipped,
 * and every remaining line must contain exactly three ';'-separated fields.
 *
 * @throws Exception any failure while reading or parsing; the offending line
 *         is printed before the exception is rethrown
 */
static void readBlocks() throws Exception {
    System.out.println("Reading 'Blocks'");
    BufferedReader reader = Utility.openUnicodeFile(blocksname, version);
    String line = "";
    try {
        String[] fields = new String[20];
        int lineNumber = 0;
        while ((line = reader.readLine()) != null) {
            ++lineNumber;
            if (SHOW && (lineNumber % 500) == 0) {
                System.out.println("//" + lineNumber + ": '" + line + "'");
            }
            // drop trailing comment, then surrounding whitespace
            int hash = line.indexOf('#');
            if (hash >= 0) {
                line = line.substring(0, hash);
            }
            line = line.trim();
            if (line.length() == 0) {
                continue;
            }
            if (Utility.split(line, ';', fields) != 3) {
                throw new ChainException("Bad count in Blocks", null);
            }
            String[] block = {
                Utility.fromHex(fields[0]),
                Utility.fromHex(fields[1]),
                fields[2].trim()
            };
            blockData.add(block);
        }
    } catch (Exception e) {
        System.out.println("Exception at: " + line);
        throw e;
    } finally {
        reader.close();
    }
}
// Every property name seen in "PROP"-style files (after the Alphabetic -> Other_Alphabetic rename).
static Set properties = new TreeSet();

/**
 * Reads one semicolon-delimited data file and stores its contents in the
 * per-character records.
 *
 * @param labels labels[0] is the file name; labels[1..] name the fields that
 *        follow the code point column. "OMIT" and "RANGE" fields are not
 *        stored; a labels[1] of "PROP" selects property-list handling.
 * @throws Exception any read or parse failure; the current line and the
 *         exception message are printed before it is rethrown
 */
static void readSemi(String[] labels) throws Exception {
    System.out.println();
    System.out.println("Reading '" + labels[0] + "'");
    if (major < 3 || (major == 3 && minor < 1)) {
        // compare contents, not references
        if (labels[0].equals("PropList")) {
            System.out.println("SKIPPING old format of Proplist for " + version);
            return;
        }
    }
    String tempVersion = version;
    if (version.equals(UCD.latestVersion)) tempVersion = "";
    BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion);
    if (input == null) {
        System.out.println("COULDN'T OPEN: " + labels[0]);
        return;
    }
    // each kind of malformed-line warning is printed only once per file
    boolean showedSemi = false;
    boolean showedShort = false;
    String line = "";
    try {
        String[] parts = new String[20];
        for (int lineNumber = 1; ; ++lineNumber) {
            line = input.readLine();
            if (line == null) break;
            if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
            String original = line;
            // strip trailing comment
            int commentPos = line.indexOf('#');
            if (commentPos >= 0) {
                line = line.substring(0, commentPos);
            }
            line = line.trim();
            if (line.length() == 0) continue;
            int count = Utility.split(line,';',parts);
            if (parts[0].equals("2801")) {
                System.out.println("debug?");
            }
            // fix malformed or simple lists.
            if (count != labels.length) {
                if (count == labels.length + 1 && parts[count-1].equals("")) {
                    if (!showedSemi) System.out.println("Extra semicolon in: " + original);
                    showedSemi = true;
                } else if (count == 1) { // fix simple list
                    ++count;
                    parts[1] = "Y";
                } else if (count < labels.length) {
                    if (!showedShort) System.out.println("Line shorter than labels: " + original);
                    showedShort = true;
                    for (int i = count; i < labels.length; ++i) {
                        parts[i] = "";
                    }
                } else {
                    // Report the line NUMBER; the previous new Integer(line) tried to
                    // parse the line text and failed with NumberFormatException.
                    throw new ChainException("wrong count at line {0}: {1}",
                        new Object[] {new Integer(lineNumber), new Integer(count)});
                }
            }
            // store char
            // first field is always character OR range. May be UTF-32
            int cpTop;
            int cpStart;
            int ddot = parts[0].indexOf(".");
            if (ddot >= 0) {
                // "XXXX..YYYY" range: skip the two dots
                cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0);
                cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0);
                System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
            } else {
                cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0);
                cpTop = cpStart;
                // NOTE(review): the result of this call is discarded; it looks as if
                // "cpTop = ..." was intended for RANGE files -- confirm before changing.
                if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
            }
            // properties first
            if (labels[1].equals("PROP")) {
                String prop = parts[2].trim();
                // FIX!!
                boolean skipLetters = false;
                if (prop.equals("Alphabetic")) {
                    prop = "Other_Alphabetic";
                    skipLetters = true;
                }
                // END FIX!!
                properties.add(prop);
                if (Utility.find(prop, UCD_Names.DeletedProperties) == -1) { // only undeleted
                    int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
                    if (end == 0) end = cpStart;
                    for (int j = cpStart; j <= end; ++j) {
                        if (j != UCD.mapToRepresentative(j, false)) continue;
                        // NOTE(review): this tests cpStart, not j, for every code point
                        // of the range -- presumably getEntry(j) was meant; confirm.
                        if (skipLetters && getEntry(cpStart).isLetter()) continue;
                        appendCharProperties(j, prop);
                    }
                }
            } else { // not range!
                String val = "";
                for (int i = 1; i < labels.length; ++i) {
                    String key = labels[i];
                    if (isHex.get(key) != null) {
                        val = Utility.fromHex(parts[i]);
                    } else {
                        val = parts[i].trim();
                    }
                    if (key.equals("OMIT")) continue;
                    if (key.equals("RANGE")) continue;
                    if (val.equals("")) continue; // skip empty values, they mean default
                    for (int cps = cpStart; cps <= cpTop; ++cps) {
                        if (UCD.mapToRepresentative(cps, false) != cps) continue; // skip condensed ranges
                        if (key.equals("binary")) {
                            appendCharProperties(cps, val);
                        } else if (key.equals("fc")) {
                            // the field before "fc" carries the case-folding type letter
                            UData data = getEntry(cps);
                            String type = parts[i-1].trim();
                            if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) {
                                data.fullCaseFolding = val;
                                //System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
                            }
                            if (type.equals("S") || type.equals("C") || type.equals("L")) {
                                data.simpleCaseFolding = val;
                                //System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
                            }
                            if (type.equals("I")) {
                                data.simpleCaseFolding = val;
                                setBinaryProperty(cps, CaseFoldTurkishI);
                                System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
                            }
                        } else {
                            /*if (key.equals("sn")) { // SKIP UNDEFINED!!
                                UData data = getEntryIfExists(cps);
                                if (data == null || data.generalCategory == Cn) continue;
                            }
                            */
                            addCharData(cps, key, val);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        System.out.println("Exception at: " + line + ", " + e.getMessage());
        throw e;
    } finally {
        input.close();
    }
    //printValues("JOINING_TYPE", jtSet);
    //printValues("JOINING_GROUP", jgSet);
}
/**
 * Prints the strings of a set to System.out twice: first as the elements of a
 * String array declaration named after title, then as a list of numbered
 * constants (spaces replaced by '-', upper-cased) followed by a LIMIT_ entry
 * holding the number of elements.
 *
 * @param title name used for the array and for the LIMIT_ constant
 * @param s set of String values, printed in the set's iteration order
 */
static void printValues(String title, Set s) {
    // first pass: the values as quoted array elements
    System.out.println("public static String[] " + title + " = {");
    for (Iterator names = s.iterator(); names.hasNext();) {
        String name = (String) names.next();
        System.out.println(" \"" + name + "\",");
    }
    System.out.println("};");

    // second pass: one numbered constant per value
    System.out.println("public static byte ");
    int ordinal = 0;
    for (Iterator names = s.iterator(); names.hasNext(); ++ordinal) {
        String constant = ((String) names.next()).replace(' ', '-').toUpperCase();
        System.out.println(" " + constant + " = " + ordinal + ",");
    }
    System.out.println(" LIMIT_" + title + " = " + ordinal);
    System.out.println(";");
}
// All character records collected so far: Integer code point -> UData (see getEntry).
// A TreeMap, so iteration is in ascending key order.
static Map charData = new TreeMap();
/**
 * Writes an XML rendering of the collected data to "UCD_Data.xml" under
 * UCD.BIN_DIR (UTF-8): a header with the version numbers and current date,
 * one block element per entry of blockData, and one e element per key of
 * charData carrying only the code point (per-field attributes are not
 * written).
 *
 * @throws IOException if the file cannot be created or written
 */
static void writeXML() throws IOException {
    System.out.println("Writing 'UCD-Main.xml'");
    FileOutputStream file = new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml");
    BufferedWriter output = new BufferedWriter(new OutputStreamWriter(file, "UTF8"), 32*1024);
    try {
        // write header
        output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
        output.write("<UnicodeCharacterDatabase>\r\n");
        output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
        String versionElement = " <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n";
        output.write(versionElement);
        String dateElement = " <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n";
        output.write(dateElement);

        // write blocks
        for (Iterator blocks = blockData.iterator(); blocks.hasNext();) {
            String[] block = (String[]) blocks.next();
            String blockElement = " <block start='" + Utility.quoteXML(block[0])
                + "' end='" + Utility.quoteXML(block[1])
                + "' name='" + Utility.quoteXML(block[2])
                + "'/>\r\n";
            output.write(blockElement);
        }

        // write char data
        for (Iterator keys = charData.keySet().iterator(); keys.hasNext();) {
            Integer codePoint = (Integer) keys.next();
            output.write(" <e c='" + Utility.quoteXML(codePoint.intValue()) + "'" );
            output.write("/>\r\n");
        }

        // write footer
        output.write("</UnicodeCharacterDatabase>\r\n");
    } finally {
        output.close();
    }
}
/**
 * Writes all collected character records to the binary data file
 * (UCD.BIN_DIR + dataFilePrefix + version + ".bin").
 *
 * Layout: format byte, major, minor, update (one byte each), the current time
 * in milliseconds (long), the record count (int), then every record in
 * charData key order via UData.writeBytes.
 *
 * @throws IOException if the file cannot be opened or the header cannot be written
 * @throws ChainException if writing a record fails; the message names the
 *         code point that was being written
 */
static void writeJavaData() throws IOException {
    // code point of the record currently being written, for error reporting
    int codePoint = -1;
    System.out.println("Writing " + dataFilePrefix + version);
    DataOutputStream dataOut = new DataOutputStream(
        new BufferedOutputStream(
            new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
            128*1024));
    // The outer try covers the header too, so the stream is closed even when
    // a header write fails.
    try {
        // write header
        dataOut.writeByte(BINARY_FORMAT);
        dataOut.writeByte(major);
        dataOut.writeByte(minor);
        dataOut.writeByte(update);
        long millis = System.currentTimeMillis();
        dataOut.writeLong(millis);
        dataOut.writeInt(charData.size());
        System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
        int count = 0;
        // write records
        try {
            Iterator it = charData.keySet().iterator();
            while (it.hasNext()) {
                Object cc = (Object) it.next();
                // keys are the Integer code points stored by getEntry()
                codePoint = ((Integer) cc).intValue();
                if (DEBUG) System.out.println(Utility.hex(cc));
                UData uData = (UData) charData.get(cc);
                if (false && uData.name == null) {
                    System.out.println("Warning: NULL name\r\n" + uData);
                    System.out.println();
                }
                if (uData.codePoint == 0x2801) {
                    System.out.println("SPOT-CHECK: " + uData);
                }
                uData.writeBytes(dataOut);
                count++;
                if (DEBUG) System.out.println("Setting2");
            }
            System.out.println("Wrote Data " + count);
        } catch (Exception e) {
            throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e);
        }
    } finally {
        dataOut.close();
    }
}
// Scratch array of 40 strings; not referenced in the visible part of this file.
static String[] xsSplit = new String[40];
// Cache a little bit for speed
// One-entry cache shared by getEntry() and getEntryIfExists(): the last code point
// looked up and its record. -1 means nothing has been cached yet.
static int getEntryCodePoint = -1;
static UData getEntryUData = null;
/**
 * Looks up the record for a code point without creating one.
 *
 * @param cp code point to look up
 * @return the existing record, or null when charData has none; a hit is
 *         remembered in the one-entry cache, a miss leaves the cache untouched
 */
static UData getEntryIfExists(int cp) {
    if (cp == getEntryCodePoint) {
        return getEntryUData;
    }
    UData found = (UData) charData.get(new Integer(cp));
    if (found != null) {
        getEntryCodePoint = cp;
        getEntryUData = found;
    }
    return found;
}
/**
 * Returns the record for a code point, creating and registering an empty
 * one in charData when none exists yet. The result is remembered in the
 * one-entry cache.
 *
 * @param cp code point to look up
 * @return the (possibly new) record; never null
 */
static UData getEntry(int cp) {
    if (cp == getEntryCodePoint) {
        return getEntryUData;
    }
    Integer key = new Integer(cp);
    UData entry = (UData) charData.get(key);
    if (entry == null) {
        entry = new UData(cp);
        charData.put(key, entry);
    }
    getEntryUData = entry;
    getEntryCodePoint = cp;
    return entry;
}
/**
 * Sets one bit in the binaryProperties mask of a character's record,
 * creating the record if necessary.
 *
 * @param cp code point whose record is updated
 * @param binProp bit position to set
 */
static void setBinaryProperty(int cp, int binProp) {
    UData target = getEntry(cp);
    target.binaryProperties |= (1 << binProp);
}
/**
 * Sets the binary property named by key on a character's record. The name is
 * translated to a bit position with Utility.lookup against UCD_Names.BP.
 *
 * @param cp code point whose record is updated
 * @param key property name
 */
static void appendCharProperties(int cp, String key) {
    int bit = Utility.lookup(key, UCD_Names.BP);
    setBinaryProperty(cp, bit);
}
// Collectors for joining-type / joining-group values. In the visible code they are only
// referenced from commented-out lines (in addCharData and at the end of readSemi).
static Set jtSet = new TreeSet();
static Set jgSet = new TreeSet();
/**
 * Stores one field value in the record of a code point (creating the record
 * if needed). A few keys get special treatment:
 * "bm" and "ce" set bits in binaryProperties; "on" replaces a name that
 * starts with '&lt;'; "dm" splits off a leading &lt;type&gt; tag into the
 * decomposition type and hex-decodes the mapping; "dd", "dv" and "nv" raise
 * the numeric type if it is lower and store the value as "nv". Every other
 * key goes straight to setField.
 *
 * @param cp code point
 * @param key field label
 * @param value field value as read from the data file
 */
static void addCharData(int cp, String key, String value) {
    UData entry = getEntry(cp);

    if (key.equals("bm")) {
        if (value.equals("Y")) entry.binaryProperties |= 1;
        return;
    }
    if (key.equals("ce")) {
        entry.binaryProperties |= 2;
        return;
    }
    if (key.equals("on")) {
        if (entry.name.charAt(0) == '<') {
            entry.name = '<' + value + '>';
        }
        return;
    }
    if (key.equals("dm")) {
        // canonical unless a <type> tag says otherwise
        entry.decompositionType = CANONICAL;
        if (value.charAt(0) == '<') {
            int close = value.indexOf('>');
            String tag = value.substring(1, close);
            if (major < 2) {
                if (tag.charAt(0) == '+') tag = tag.substring(1);
            }
            value = value.substring(close + 1);
            setField(entry, "dt", tag);
        }
        // FIX OLD
        if (major < 2) {
            int cut = value.indexOf('<');
            if (cut > 0) {
                value = value.substring(0, cut);
            }
            cut = value.indexOf('{');
            if (cut > 0) {
                value = value.substring(0, cut);
            }
        }
        setField(entry, key, Utility.fromHex(value));
        return;
    }
    // fix the numeric fields to be more sensible
    if (key.equals("dd")) {
        if (entry.numericType < UCD_Types.DECIMAL) {
            entry.numericType = UCD_Types.DECIMAL;
        }
        setField(entry, "nv", value);
        return;
    }
    if (key.equals("dv")) {
        if (entry.numericType < UCD_Types.DIGIT) {
            entry.numericType = UCD_Types.DIGIT;
        }
        setField(entry, "nv", value);
        return;
    }
    if (key.equals("nv")) {
        if (entry.numericType < UCD_Types.NUMERIC) {
            entry.numericType = UCD_Types.NUMERIC;
        }
        setField(entry, "nv", value);
        return;
    }
    setField(entry, key, value);
}
/**
 * Stores a single field value in a character record, dispatching on the
 * short field label. String-valued fields are stored as given; enumerated
 * fields are translated with Utility.lookup against the matching
 * UCD_Names table; "nv", "cc" and "bp" are parsed as numbers.
 *
 * @param uData record to update
 * @param fieldName short field label ("n", "dm", "gc", ...)
 * @param fieldValue value to store
 * @throws ChainException wrapping any failure, including the
 *         IllegalArgumentException raised for an unknown field name; the
 *         message carries both the field name and the value
 */
static public void setField(UData uData, String fieldName, String fieldValue) {
try {
if (fieldName.equals("n")) {
uData.name = fieldValue;
} else if (fieldName.equals("dm")) {
uData.decompositionMapping = fieldValue;
} else if (fieldName.equals("bg")) {
uData.bidiMirror = fieldValue;
} else if (fieldName.equals("uc")) {
uData.simpleUppercase = fieldValue;
} else if (fieldName.equals("lc")) {
uData.simpleLowercase = fieldValue;
} else if (fieldName.equals("tc")) {
uData.simpleTitlecase = fieldValue;
} else if (fieldName.equals("su")) {
uData.fullUppercase = fieldValue;
} else if (fieldName.equals("sl")) {
uData.fullLowercase = fieldValue;
} else if (fieldName.equals("st")) {
uData.fullTitlecase = fieldValue;
} else if (fieldName.equals("sc")) {
uData.specialCasing = fieldValue;
} else if (fieldName.equals("xp")) {
// ORs in one bit; existing bits are kept
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP);
//UCD_Names.BP_OLD
} else if (fieldName.equals("gc")) {
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GC);
} else if (fieldName.equals("bc")) {
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BC);
} else if (fieldName.equals("dt")) {
// for major versions below 2, map the old tag spellings onto the names in UCD_Names.DT
if (major < 2) {
if (fieldValue.equals("no-break")) fieldValue = "noBreak";
else if (fieldValue.equals("circled")) fieldValue = "circle";
else if (fieldValue.equals("sup")) fieldValue = "super";
else if (fieldValue.equals("break")) fieldValue = "compat";
else if (fieldValue.equals("font variant")) fieldValue = "font";
else if (fieldValue.equals("no-join")) fieldValue = "compat";
else if (fieldValue.equals("join")) fieldValue = "compat";
}
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.DT);
} else if (fieldName.equals("nt")) {
uData.numericType = Utility.lookup(fieldValue, UCD_Names.NT);
} else if (fieldName.equals("ea")) {
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EA);
} else if (fieldName.equals("lb")) {
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LB);
} else if (fieldName.equals("sn")) {
uData.script = Utility.lookup(fieldValue, UCD_Names.SCRIPT);
} else if (fieldName.equals("jt")) {
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE);
} else if (fieldName.equals("jg")) {
uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.OLD_JOINING_GROUP);
} else if (fieldName.equals("nv")) {
// for major versions below 2, "-" leaves the numeric value unset
if (major < 2) {
if (fieldValue.equals("-")) return;
}
uData.numericValue = Utility.floatFrom(fieldValue);
} else if (fieldName.equals("cc")) {
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
} else if (fieldName.equals("bp")) {
// NOTE(review): this REPLACES the whole mask and truncates to a byte, unlike "xp" above -- confirm intended
uData.binaryProperties = (byte)Utility.intFrom(fieldValue);
} else {
throw new IllegalArgumentException("Unknown fieldName");
}
} catch (Exception e) {
// the unknown-field exception thrown above is caught here as well and wrapped
throw new ChainException(
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
}
}
}

View file

@ -0,0 +1,440 @@
package com.ibm.text.UCD;
import java.io.*;
import java.util.*;
import com.ibm.text.utility.*;
/**
 * PropertyLister that emits one derived property, selected by the propMask
 * constant passed to the constructor (one of the int constants below).
 * headerString() supplies the file-header comment for the property,
 * propertyName() the label printed for a code point, and status() decides
 * whether a code point is listed.
 *
 * The QuickNFx and FC_NFKC_Closure properties remember a per-code-point value
 * in static state (computedValue, lastValue, currentValue) that is shared by
 * all instances.
 */
final class DerivedPropertyLister extends PropertyLister {
    static final boolean BRIDGE = false;

    static int enum = 0;

    // Property selectors. The four members of each normalization group
    // (Quick*, ExpandsOn*, Gen*) are consecutive and ordered NFD, NFC, NFKD,
    // NFKC so that (propMask - groupStart) indexes nf[] and NAME[].
    static final int
        PropMath = 0,
        PropAlphabetic = 1,
        PropLowercase = 2,
        PropUppercase = 3,
        ID_Start = 4,
        ID_Continue_NO_Cf = 5,
        Mod_ID_Start = 6,
        Mod_ID_Continue_NO_Cf = 7,
        Missing_Uppercase = 8,
        Missing_Lowercase = 9,
        Missing_Mixedcase = 10,
        FC_NFKC_Closure = 11,
        FullCompExclusion = 12,
        FullCompInclusion = 13,
        QuickNFD = 14,
        QuickNFC = 15,
        QuickNFKD = 16,
        QuickNFKC = 17,
        ExpandsOnNFD = 18,
        ExpandsOnNFC = 19,
        ExpandsOnNFKD = 20,
        ExpandsOnNFKC = 21,
        GenNFD = 22,
        GenNFC = 23,
        GenNFKD = 24,
        GenNFKC = 25,
        LIMIT = 26;

    private int propMask;
    // normalizers indexed NFD, NFC, NFKD, NFKC; the named fields alias the same objects
    private Normalizer[] nf = new Normalizer[4];
    private Normalizer nfd, nfc, nfkd, nfkc;
    // value returned by minPropertyWidth()
    int width;

    /**
     * @param ucd character database to query
     * @param propMask which derived property to list (one of the constants above)
     * @param output stream the listing is written to
     */
    public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
        this.propMask = propMask;
        this.output = output;
        this.ucdData = ucd;
        nfd = nf[0] = new Normalizer(Normalizer.NFD);
        nfc = nf[1] = new Normalizer(Normalizer.NFC);
        nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
        nfkc = nf[3] = new Normalizer(Normalizer.NFKC);

        width = super.minPropertyWidth();
        switch (propMask) {
          case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
            alwaysBreaks = true;
            break;
          case FC_NFKC_Closure:
            alwaysBreaks = true;
            width = 21;
            break;
          case QuickNFC: case QuickNFKC:
            width = 11;
            break;
        }
    }

    /**
     * Returns the header comment for the selected property.
     */
    public String headerString() {
        String result = "# Derived Property: ";
        switch (propMask) {
          case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
            result += "Expands_On_" + NAME[propMask-ExpandsOnNFD] + "\r\n# Generated according to UAX #15."
                + "\r\n# Characters whose normalized length is not one."
                + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
                + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
            break;
          case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
            // NOTE(review): propMask is 22..25 in this case, so the test against
            // 5 and 3 is never true and the Hangul note is always emitted. The
            // literals look like leftovers from an older numbering -- confirm
            // which forms were meant before changing.
            result += NAME[propMask-GenNFD] + "\r\n# Generated according to UAX #15."
                + "\r\n# Normalized forms, where different from the characters themselves."
                + ((propMask == 5 || propMask == 3)
                    ? ""
                    : "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly.")
                + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
                + "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
            break;
          case ID_Start: result +=
            "ID_Start"
            + "\r\n# Characters that can start an identifier."
            + "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
            break;
          case ID_Continue_NO_Cf: result +=
            "ID_Continue"
            + "\r\n# Characters that can continue an identifier."
            + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
            + "\r\n# NOTE: Cf characters should be filtered out.";
            break;
          case Mod_ID_Start: result +=
            "XID_Start"
            + "\r\n# ID_Start modified for closure under NFKx"
            + "\r\n# Modified as described in UAX #15"
            + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
            + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            break;
          case Mod_ID_Continue_NO_Cf: result +=
            "XID_Continue"
            + "\r\n# Mod_ID_Continue modified for closure under NFKx"
            + "\r\n# Modified as described in UAX #15"
            + "\r\n# NOTE: Cf characters should be filtered out."
            + "\r\n# NOTE: Does NOT remove the non-NFKx characters."
            + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
            break;
          case PropMath:
            result += "Math"
            + "\r\n# Generated from: Sm + Other_Math";
            break;
          case PropAlphabetic:
            result += "Alphabetic"
            + "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
            break;
          case PropLowercase:
            result += "Lowercase"
            + "\r\n# Generated from: Ll + Other_Lowercase";
            break;
          case PropUppercase: result +=
            "Uppercase"
            + "\r\n# Generated from: Lu + Other_Uppercase";
            break;
          case Missing_Uppercase: result +=
            "Missing_Uppercase"
            + "\r\n# Generated from: NFKD has >0 Uppercase, no other cases";
            break;
          case Missing_Lowercase: result +=
            "Missing_Lowercase"
            + "\r\n# Generated from: NFKD has >0 Lowercase, no other cases";
            break;
          case Missing_Mixedcase: result +=
            "Missing_Mixedcase"
            + "\r\n# Generated from: NFKD has >0 Mixedcase, no other cases";
            break;
          case FullCompExclusion: result +=
            "Full Composition Exclusion"
            + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
            break;
          case FullCompInclusion: result +=
            "Full Composition Inclusion"
            + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
            break;
          case FC_NFKC_Closure: result +=
            "FC_NFKC_Closure"
            + "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
            + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
            + "\r\n# mappings that constitute the FC_NFKC_Closure list";
            break;
          case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
            result += NAME[propMask-QuickNFD] + "_QuickCheck"
            + "\r\n# Generated from computing decomposibles"
            + ((propMask == QuickNFC || propMask == QuickNFKC)
                ? " (and characters that may compose with previous ones)" : "");
            break;
          default: result += "Unimplemented!!";
        }
        return result;
    }

    /**
     * Returns the property label printed for a code point. For the Gen*,
     * FC_NFKC_Closure and Quick* properties the label also carries a value
     * (the normalized form, or the value stored by status() for that code point).
     */
    public String propertyName(int cp) {
        switch (propMask) {
          case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
            return "Expands_On_" + NAME[propMask-ExpandsOnNFD];
          case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
            // Hangul syllable range: listed without an explicit mapping
            if (cp >= 0xAC00 && cp <= 0xD7A3) return NAME[propMask-GenNFD] + "; " + "<algorithmic normalization>";
            String norm = Utility.hex(nf[propMask-GenNFD].normalize(cp));
            String pad = Utility.repeat(" ", 14-norm.length());
            return NAME[propMask-GenNFD] + "; " + norm + pad;
          case ID_Start: return "ID_Start";
          case ID_Continue_NO_Cf: return "ID_Continue";
          case Mod_ID_Start: return "XID_Start";
          case Mod_ID_Continue_NO_Cf: return "XID_Continue";
          case PropMath: return "Math";
          case PropAlphabetic: return "Alphabetic";
          case PropLowercase: return "Lowercase";
          case PropUppercase: return "Uppercase";
          case Missing_Uppercase: return "Possible_Missing_Uppercase";
          case Missing_Lowercase: return "Possible_Missing_Lowercase";
          case Missing_Mixedcase: return "Possible_Missing_Titlecase";
          case FullCompExclusion: return "Comp_Ex";
          case FullCompInclusion: return "Comp_In";
          case FC_NFKC_Closure: return "FNC; " + Utility.hex(getComputedValue(cp));
          case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
            return NAME[propMask-QuickNFD] + "_" + getComputedValue(cp);
          default: return "Unimplemented!!";
        }
    }

    //public String optionalComment(int cp) {
    //    return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]";
    //}

    /** Returns the width chosen in the constructor for the selected property. */
    public int minPropertyWidth() {
        return width;
    }

    // Names of the normalization forms, in the same order as nf[].
    static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};

    /*
    public String optionalComment(int cp) {
        String id = ucdData.getCategoryID(cp);
        if (UCD.mainCategoryMask(ucdData.getCategory(cp)) == LETTER_MASK) return id.substring(0,1) + "*";
        return id;
    }
    */
    /*
    public String optionalName(int cp) {
        if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
            return Utility.hex(ucdData.getDecompositionMapping(cp));
        } else {
            return "";
        }
    }
    */

    /**
     * Decides whether a code point belongs to the selected property.
     *
     * @return EXCLUDE when the code point is unassigned or does not have the
     *         property; INCLUDE when it has it; BREAK when it has it and the
     *         listing should start a new range (Gen*, FC_NFKC_Closure, and
     *         Quick* when the value changes)
     */
    public byte status(int cp) {
        if (!ucdData.isAssigned(cp)) return EXCLUDE;
        //if (cp == 0xFFFF) {
        //    System.out.println("# " + Utility.hex(cp));
        //}
        byte cat = ucdData.getCategory(cp);
        //if (cp == 0x0385) {
        //    System.out.println(Utility.hex(firstRealCp));
        //}
        String cps;
        byte xCat;

        switch (propMask) {
          default: return EXCLUDE;

          case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
            if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
            cps = UTF32.valueOf32(cp);
            if (UTF32.length32(nf[propMask-ExpandsOnNFD].normalize(cps)) == UTF32.length32(cps)) return EXCLUDE;
            break;

          case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
            if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
            cps = UTF32.valueOf32(cp);
            if (cps.equals(nf[propMask-GenNFD].normalize(cps))) {
                return EXCLUDE;
            }
            if (cp >= 0xAC00 && cp <= 0xD7A3) return INCLUDE;
            //System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[propMask-4].normalize(cps)));
            return BREAK;

          case ID_Start:
            if (ucdData.isIdentifierStart(cp, false)) return INCLUDE;
            return EXCLUDE;
          case ID_Continue_NO_Cf:
            if (ucdData.isIdentifierContinue_NO_Cf(cp, false)) return INCLUDE;
            return EXCLUDE;
          case Mod_ID_Start:
            if (ucdData.isIdentifierStart(cp, true)) return INCLUDE;
            return EXCLUDE;
          case Mod_ID_Continue_NO_Cf:
            if (ucdData.isIdentifierContinue_NO_Cf(cp, true)) return INCLUDE;
            return EXCLUDE;

          case PropMath:
            if (cat == Sm
                || ucdData.getBinaryProperty(cp,Math_Property)) return INCLUDE;
            return EXCLUDE;
          case PropAlphabetic:
            if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
                || ucdData.getBinaryProperty(cp, Alphabetic)) return INCLUDE;
            // Without this return the case fell through into PropLowercase and
            // listed Other_Lowercase characters as Alphabetic.
            return EXCLUDE;
          case PropLowercase:
            if (cat == Ll
                || ucdData.getBinaryProperty(cp, Other_Lowercase)) return INCLUDE;
            return EXCLUDE;
          case PropUppercase:
            if (cat == Lu
                || ucdData.getBinaryProperty(cp, Other_Uppercase)) return INCLUDE;
            return EXCLUDE;

          case Missing_Uppercase:
            if (cat == Lu
                || ucdData.getBinaryProperty(cp, Other_Uppercase)) return EXCLUDE;
            xCat = getDecompCat(cp);
            if (xCat == Lu) return INCLUDE;
            return EXCLUDE;
          case Missing_Lowercase:
            if (cat == Ll
                || ucdData.getBinaryProperty(cp, Other_Lowercase)) return EXCLUDE;
            xCat = getDecompCat(cp);
            if (xCat == Ll) return INCLUDE;
            return EXCLUDE;
          case Missing_Mixedcase:
            if (cat == Lt) return EXCLUDE;
            xCat = getDecompCat(cp);
            if (xCat == Lt) return INCLUDE;
            return EXCLUDE;

          case FullCompExclusion:
            /*
            (3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
            including all characters whose canonical decomposition consists of a single character.
            (4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
            file by including all characters whose canonical decomposition consists of a sequence
            of characters, the first of which has a non-zero combining class.
            */
            {
                if (!ucdData.isRepresented(cp)) return EXCLUDE;
                byte dtype = ucdData.getDecompositionType(cp);
                if (dtype != CANONICAL) return EXCLUDE;
                if (isCompEx(cp)) return INCLUDE;
                return EXCLUDE;
            }
          case FullCompInclusion:
            {
                if (!ucdData.isRepresented(cp)) return EXCLUDE;
                byte dtype = ucdData.getDecompositionType(cp);
                if (dtype != CANONICAL) return EXCLUDE;
                if (isCompEx(cp)) return EXCLUDE;
                return INCLUDE;
            }

          case FC_NFKC_Closure:
            if (!ucdData.isRepresented(cp)) return EXCLUDE;
            /*
            b = Normalize(Fold(a));
            c = Normalize(Fold(b));
            if (c != b) add a => c
            */
            {
                String b = nfkc.normalize(fold(cp));
                String c = nfkc.normalize(fold(b));
                if (c.equals(b)) return EXCLUDE;
                // remembered so propertyName() can print the mapping
                setComputedValue(cp, c);
                if (cp == 0x1F88) {
                    System.out.println(ucdData.toString(cp));
                    System.out.println("cp: " + ucdData.getCodeAndName(cp));
                    System.out.println("fold(cp): " + ucdData.getCodeAndName(fold(cp)));
                    System.out.println("b: " + ucdData.getCodeAndName(b));
                    System.out.println("fold(b): " + ucdData.getCodeAndName(fold(b)));
                    System.out.println("c: " + ucdData.getCodeAndName(c));
                }
                return BREAK;
            }

          case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
            lastValue = currentValue;
            Normalizer nfx = nf[propMask - QuickNFD];
            if (nfx.normalizationDiffers(cp)) currentValue = "NO";
            else if (nfx.isTrailing(cp)) currentValue = "MAYBE";
            else return EXCLUDE;
            setComputedValue(cp, currentValue);
            // compare contents rather than references
            if (!currentValue.equals(lastValue)) return BREAK;
            return INCLUDE;
        }

        // handle script stuff
        /*
        if (firstRealCp == -1) return INCLUDE;
        byte cat2 = ucdData.getCategory(firstRealCp);
        if (cat == cat2) return INCLUDE;
        int mc = UCD.mainCategoryMask(cat);
        if (LETTER_MASK == mc && mc == UCD.mainCategoryMask(cat2)) return INCLUDE;
        return BREAK;
        */
        return INCLUDE;
    }

    // Values computed by status() and read back by propertyName(): Integer code point -> String.
    static Map computedValue = new HashMap();

    static String getComputedValue(int cp) {
        return (String) computedValue.get(new Integer(cp));
    }

    static void setComputedValue(int cp, String value) {
        computedValue.put(new Integer(cp), value);
    }

    // Previous and current Quick* value, used by status() to detect a change.
    static String lastValue = "";
    static String currentValue = "";

    /**
     * Returns true when cp is excluded from composition: it has the
     * CompositionExclusion property, its decomposition mapping is a single
     * code point, or the first code point of the mapping has a non-zero
     * combining class.
     */
    boolean isCompEx(int cp) {
        if (ucdData.getBinaryProperty(cp, CompositionExclusion)) return true;
        String decomp = ucdData.getDecompositionMapping(cp);
        if (UTF32.length32(decomp) == 1) return true;
        int first = UTF32.char32At(decomp,0);
        if (ucdData.getCombiningClass(first) != 0) return true;
        return false;
    }

    StringBuffer foldBuffer = new StringBuffer();

    /** Full case folding of a single code point. */
    String fold(int cp) {
        return ucdData.getCase(cp, FULL, FOLD);
    }

    /** Full case folding of a string. */
    String fold(String s) {
        return ucdData.getCase(s, FULL, FOLD);
    }

    /**
     * Classifies the case of a code point, looking at its NFKD form when
     * the code point itself is not cased.
     *
     * @return Lu or Ll when cp is upper/lower case itself (by category or
     *         Other_* property); the category for Lt, Lo, Lm, Nl; Lo when NFKD
     *         leaves cp unchanged; otherwise Ll, Lu or Lt according to which
     *         cases occur in the NFKD form, or the category of cp when none do
     */
    byte getDecompCat(int cp) {
        byte cat = ucdData.getCategory(cp);
        if (cat == Lu
            || ucdData.getBinaryProperty(cp, Other_Uppercase)) return Lu;
        if (cat == Ll
            || ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
        if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
        if (!nf[2].normalizationDiffers(cp)) return Lo;

        String norm = nf[2].normalize(cp);
        int cp2;
        boolean gotUpper = false;
        boolean gotLower = false;
        boolean gotTitle = false;
        for (int i = 0; i < norm.length(); i += UTF32.count16(cp2)) {
            cp2 = UTF32.char32At(norm, i);
            byte catx = ucdData.getCategory(cp2);
            // Test the decomposition character cp2. Testing cp here is always
            // false, because both properties were ruled out for cp above.
            boolean upx = ucdData.getBinaryProperty(cp2, Other_Uppercase);
            boolean lowx = ucdData.getBinaryProperty(cp2, Other_Lowercase);
            if (catx == Ll || lowx || cp2 == 0x345) gotLower = true;
            if (catx == Lu || upx) gotUpper = true;
            if (catx == Lt) gotTitle = true;
        }
        if (gotLower && !gotUpper && !gotTitle) return Ll;
        if (!gotLower && gotUpper && !gotTitle) return Lu;
        if (gotLower || gotUpper || gotTitle) return Lt;
        return cat;
    }
}

View file

@ -0,0 +1,65 @@
package com.ibm.text.UCD;
import java.io.*;
/**
 * PropertyLister that lists the code points allocated in one UCD version,
 * optionally restricted to those that were not yet allocated in an older
 * version. The property label printed is the version string of the newer UCD.
 */
class DiffPropertyLister extends PropertyLister {
    // older version to diff against; null means "list everything allocated"
    private UCD oldUCD;

    /**
     * @param oldUCDName version to compare against, or null for no comparison
     * @param newUCDName version whose allocated code points are listed
     * @param output stream the listing is written to
     */
    public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
        this.output = output;
        this.ucdData = UCD.make(newUCDName);
        if (oldUCDName != null) {
            this.oldUCD = UCD.make(oldUCDName);
        }
    }

    /** Single-argument form: always INCLUDE. */
    public byte status (int cp) {
        return INCLUDE;
    }

    /** The label is the version string of the newer UCD. */
    public String propertyName(int cp) {
        return ucdData.getVersion();
    }

    /*
    public String optionalName(int cp) {
        if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
            return Utility.hex(ucdData.getDecompositionMapping(cp));
        } else {
            return "";
        }
    }
    */

    /**
     * INCLUDE when cp is allocated in the newer UCD and, if an older UCD was
     * given, not allocated there; EXCLUDE otherwise.
     */
    public byte status(int lastCp, int cp) {
        if (!ucdData.isAllocated(cp)) {
            return EXCLUDE;
        }
        if (oldUCD != null && oldUCD.isAllocated(cp)) {
            return EXCLUDE;
        }
        return INCLUDE;
    }

    /**
     * Prints a heading, the listing produced by the superclass, and a total line.
     *
     * @return the count returned by the superclass print()
     */
    public int print() {
        boolean diffing = oldUCD != null;
        String heading = diffing
            ? "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion()
            : "# Allocated as of " + ucdData.getVersion();
        output.println();
        output.println();
        output.println(heading);
        output.println();
        System.out.println(heading);

        int total = super.print();

        output.println();
        String totalLabel = (oldUCD != null)
            ? " new code points allocated in "
            : " code points allocated in ";
        output.println("# Total " + total + totalLabel + ucdData.getVersion());
        output.println();
        return total;
    }
}

View file

@ -0,0 +1,342 @@
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
// Debug switch; not read in the visible part of this class.
public static boolean DEBUG = false;
// Character database used for all lookups here, obtained from UCD.make("310").
public static UCD ucd = UCD.make("310");
/**
 * Command-line entry point; the arguments are ignored and the sample
 * case-folding file is generated.
 */
public static void main(String[] args) throws java.io.IOException {
    //getAge();
    makeCaseFold();
}
/**
 * Computes full and simple case foldings and writes them to
 * "CaseFoldingSample.txt" (UTF-8), one line per mapping via drawLine.
 * A code point whose full and simple foldings agree gets a single line of
 * type "C" ("I" for U+0130 and U+0131); otherwise it gets an "F" line and/or
 * an "S" line for whichever folding exists.
 *
 * @throws java.io.IOException if the output file cannot be created
 */
public static void makeCaseFold() throws java.io.IOException {
    System.out.println("Making Full Data");
    Map fullData = getCaseFolding(true);
    System.out.println("Making Simple Data");
    Map simpleData = getCaseFolding(false);
    // write the data
    System.out.println("Writing");
    PrintWriter out = new PrintWriter(
        new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream("CaseFoldingSample.txt"),
                "UTF8"),
            4*1024));
    // close the file even if a lookup or name query throws part-way through
    try {
        // NOTE(review): the bound excludes U+10FFFF itself; kept as written.
        for (int ch = 0; ch < 0x10FFFF; ++ch) {
            // both maps are keyed by the code point as a string
            String key = UTF32.valueOf32(ch);
            String rFull = (String)fullData.get(key);
            String rSimple = (String)simpleData.get(key);
            if (rFull == null && rSimple == null) continue;
            if (rFull != null && rFull.equals(rSimple)) {
                String type = "C";
                if (ch == 0x130 || ch == 0x131) type = "I";
                drawLine(out, ch, type, rFull);
            } else {
                if (rFull != null) {
                    drawLine(out, ch, "F", rFull);
                }
                if (rSimple != null) {
                    drawLine(out, ch, "S", rSimple);
                }
            }
        }
    } finally {
        out.close();
    }
}
/**
 * Writes one output line of the form
 * "hex(ch); type; hex(result); # name(ch)" to out.
 *
 * @param out    destination writer
 * @param ch     source code point
 * @param type   folding type label
 * @param result folded string, printed as space-separated hex
 */
static void drawLine(PrintWriter out, int ch, String type, String result) {
    StringBuffer line = new StringBuffer();
    line.append(Utility.hex(ch));
    line.append("; ").append(type);
    line.append("; ").append(Utility.hex(result, " "));
    line.append("; # ").append(ucd.getName(ch));
    out.println(line.toString());
}
/**
 * Builds the case-folding map for either full or simple case mappings.
 * First the case-equivalence classes of all represented code points are
 * collected via getClosure(); then, for every class, the member with the
 * highest goodness() score is chosen as representative, and every other
 * member whose String length is 1 is mapped to that representative.
 *
 * @param full true to use full case mappings, false for simple ones
 * @return map from single-character strings to their representative string
 */
static Map getCaseFolding(boolean full) throws java.io.IOException {
    Map data = new TreeMap();
    Map repChar = new TreeMap();
    //String option = "";

    // get the equivalence classes
    // (<= so that the last code point, U+10FFFF, is visited too)
    for (int ch = 0; ch <= 0x10FFFF; ++ch) {
        if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch)); // progress
        if (!ucd.isRepresented(ch)) continue;
        getClosure(ch, data, full);
    }

    // get the representative characters
    Iterator it = data.keySet().iterator();
    while (it.hasNext()) {
        String s = (String) it.next();
        Set set = (Set) data.get(s);

        // pick the first member with the strictly highest score
        String rep = null;
        int repGood = 0;
        Iterator it2 = set.iterator();
        while (it2.hasNext()) {
            String s2 = (String)it2.next();
            int s2Good = goodness(s2, full);
            if (s2Good > repGood) {
                rep = s2;
                repGood = s2Good;
            }
        }
        if (rep == null) System.err.println("No representative for: " + toString(set));
        else if (repGood < 128) {
            // the winner does not survive lower(upper(x)); see goodness()
            System.err.println("Non-optimal!!: "
                + ucd.getName(rep) + ", " + toString(set,true));
        }

        // map every other single-char member to the representative
        it2 = set.iterator();
        while (it2.hasNext()) {
            String s2 = (String)it2.next();
            if (s2.length() == 1 && !s2.equals(rep)) repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
        }
    }
    return repChar;
}
/**
 * Scores a candidate representative string. The score starts at the string
 * length; bit 128 is set when s equals lower(upper(s)), and bit 64 is set
 * when s is unchanged by NFC normalization. A null string scores 0.
 */
static int goodness(String s, boolean full) {
    if (s == null) {
        return 0;
    }
    boolean roundTrips = s.equals(lower(upper(s, full), full));
    boolean isNormalized = s.equals(NFC.normalize(s));

    int score = s.length();
    if (roundTrips) {
        score |= 128;
    }
    if (isNormalized) {
        score |= 64;
    }
    return score;
}
static Normalizer NFC = new Normalizer(Normalizer.NFC);
/*
static HashSet temp = new HashSet();
static void normalize(HashSet set) {
temp.clear();
temp.addAll(set);
set.clear();
Iterator it = temp.iterator();
while (it.hasNext()) {
String s = (String) it.next();
String s2 = KC.normalize(s);
set.add(s);
data2.put(s,set);
if (!s.equals(s2)) {
set.add(s2);
data2.put(s2,set);
System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
}
}
}
*/
/*
String
String lower1 = ucd.getLowercase(ch);
String lower2 = ucd.toLowercase(ch,option);
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
//String lower1 = String.valueOf(ucd.getLowercase(ch));
//String lower = ucd.toLowercase(ch2,option);
String upper = ucd.toUppercase(ch2,option);
String lowerUpper = ucd.toLowercase(upper,option);
//String title = ucd.toTitlecase(ch2,option);
//String lowerTitle = ucd.toLowercase(upper,option);
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
output.println(Utility.hex(ch)
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
+ "; " + Utility.hex(lowerUpper," ")
+ ";\t#" + ucd.getName(ch)
);
//if (!lowerUpper.equals(lower)) {
// output.println("Warning1: " + Utility.hex(lower) + " " + ucd.getName(lower));
//}
//if (!lowerUpper.equals(lowerTitle)) {
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + ucd.getName(lowerTitle));
//}
}
*/
/**
 * Builds the case-equivalence set for one code point and registers it in data.
 * Code points that are unchanged by lower, title and upper mapping are ignored.
 * Otherwise a new TreeSet is seeded with the character and its three case
 * forms, and then repeatedly extended with the case forms of its members
 * until no new string can be added. Merging with previously registered sets
 * is handled inside add().
 *
 * @param ch   code point to process
 * @param data map from string to the set that contains it; updated in place
 * @param full true to use full case mappings, false for simple ones
 */
static void getClosure(int ch, Map data, boolean full) {
String charStr = UTF32.valueOf32(ch);
String lowerStr = lower(charStr, full);
String titleStr = title(charStr, full);
String upperStr = upper(charStr, full);
// caseless character: nothing to record
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
// make new set
Set set = new TreeSet();
set.add(charStr);
data.put(charStr, set);
// add cases to get started
add(set, lowerStr, data);
add(set, upperStr, data);
add(set, titleStr, data);
// close it
// Each successful add() changes the set, which invalidates the iterator,
// so the scan restarts from the top via "continue main". The loop ends
// after one complete pass in which nothing was added.
main:
while (true) {
Iterator it = set.iterator();
while (it.hasNext()) {
String s = (String) it.next();
// do funny stuff since we can't modify set while iterating
//if (add(set, NFC.normalize(s), data)) continue main;
if (add(set, lower(s, full), data)) continue main;
if (add(set, title(s, full), data)) continue main;
if (add(set, upper(s, full), data)) continue main;
}
break;
}
}
/**
 * Lowercases s via lower2() and then replaces every U+03C2 in the result
 * with U+03C3.
 */
static String lower(String s, boolean full) {
    // HACK for lower
    return lower2(s, full).replace('\u03C2', '\u03C3');
}
// These functions are no longer necessary, since UCD is parameterized,
// but it's not worth changing
/**
 * Raw lowercase mapping. In full mode the whole string is mapped with the
 * FULL mapping; in simple mode only strings of length 1 are mapped (with the
 * SIMPLE mapping) and any other string is returned unchanged.
 */
static String lower2(String s, boolean full) {
    if (full) {
        return ucd.getCase(s, FULL, LOWER);
    }
    if (s.length() != 1) {
        return s;
    }
    return ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
}
/**
 * Uppercase mapping, parallel to lower2(). In full mode the whole string is
 * mapped with the FULL mapping; in simple mode only strings of length 1 are
 * mapped (with the SIMPLE mapping) and any other string is returned unchanged.
 *
 * The mapping kinds were previously swapped (FULL used in simple mode and
 * SIMPLE in full mode), which contradicted lower2().
 */
static String upper(String s, boolean full) {
    if (!full) {
        if (s.length() != 1) return s;
        return ucd.getCase(UTF32.char32At(s,0), SIMPLE, UPPER);
    }
    return ucd.getCase(s, FULL, UPPER);
}
/**
 * Titlecase mapping, parallel to lower2(). In full mode the whole string is
 * mapped with the FULL mapping; in simple mode only strings of length 1 are
 * mapped (with the SIMPLE mapping) and any other string is returned unchanged.
 *
 * The mapping kinds were previously swapped (FULL used in simple mode and
 * SIMPLE in full mode), which contradicted lower2().
 */
static String title(String s, boolean full) {
    if (!full) {
        if (s.length() != 1) return s;
        return ucd.getCase(UTF32.char32At(s,0), SIMPLE, TITLE);
    }
    return ucd.getCase(s, FULL, TITLE);
}
/**
 * Adds s to set. If data already associates s with a different set, that
 * other set is merged in: each of its members is re-pointed to set in data
 * and then copied into set.
 *
 * @return false if set already contained s (nothing changes), true otherwise
 */
static boolean add(Set set, String s, Map data) {
    if (!set.add(s)) {
        return false; // already a member
    }
    if (DEBUG) System.err.println("adding: " + toString(set));

    Set previous = (Set) data.get(s);
    boolean needsMerge = previous != null && previous != set;
    if (needsMerge) {
        // make all the items in the other set point to the merged set
        for (Iterator members = previous.iterator(); members.hasNext(); ) {
            data.put(members.next(), set);
        }
        set.addAll(previous);
    }

    if (DEBUG) System.err.println("done adding: " + toString(set));
    return true;
}
/**
 * Formats a set of strings as "{h1, h2, ...}" where each element is rendered
 * with Utility.hex(element, " ").
 */
static String toString(Set set) {
    StringBuffer buf = new StringBuffer("{");
    boolean needSeparator = false;
    for (Iterator items = set.iterator(); items.hasNext(); ) {
        String item = (String) items.next();
        if (needSeparator) {
            buf.append(", ");
        }
        needSeparator = true;
        buf.append(Utility.hex(item, " "));
    }
    buf.append("}");
    return buf.toString();
}
/**
 * Formats a set of strings as "{n1, n2, ...}" where each element is rendered
 * with ucd.getName(element). The boolean parameter is not read; it only
 * distinguishes this overload from toString(Set).
 */
static String toString(Set set, boolean t) {
    StringBuffer buf = new StringBuffer("{");
    boolean needSeparator = false;
    for (Iterator items = set.iterator(); items.hasNext(); ) {
        String item = (String) items.next();
        if (needSeparator) {
            buf.append(", ");
        }
        needSeparator = true;
        buf.append(ucd.getName(item));
    }
    buf.append("}");
    return buf.toString();
}
/**
 * Writes "UnicodeAge.txt": a header followed by five DiffPropertyLister
 * listings, one for version "110" on its own and one for each consecutive
 * pair of versions up to "300" -> "310". The stream is closed in a finally
 * block.
 *
 * @throws IOException if the output file cannot be opened
 */
static final void getAge() throws IOException {
PrintStream log = new PrintStream(
new BufferedOutputStream (
new FileOutputStream("UnicodeAge.txt"),
4*1024));
try {
log.println("# Derived file showing when various code points were allocated in Unicode");
log.println("# author: M. Davis");
log.println("# generated: " + new Date());
log.println("# Notes:");
log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing.");
log.println("# - The supplementary private use code points, although allocated earlier,");
log.println("# were NOT specifically listed in the UCD until 3.0.1, and are not included until then.");
// first listing has no older version to diff against; the rest are pairwise diffs
new DiffPropertyLister(null, "110", log).print();
new DiffPropertyLister("110", "200", log).print();
new DiffPropertyLister("200", "210", log).print();
new DiffPropertyLister("210", "300", log).print();
new DiffPropertyLister("300", "310", log).print();
// Disabled earlier implementation based on UnicodeSet, kept for reference.
/*
printDiff("110", "200");
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
log.println();
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
+ n.format(u11.count()));
log.println();
u11.print(log, false, false, "1.1");
UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
log.println();
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
+ n.format(u20m.count()));
log.println();
u20m.print(log, false, false, "2.0");
UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
log.println();
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
+ n.format(u21m.count()));
log.println();
u21m.print(log, false, false, "2.1");
UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
log.println();
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
+ n.format(u30m.count()));
log.println();
u30m.print(log, false, false, "3.0");
UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
log.println();
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
+ n.format(u31m.count()));
log.println();
u31m.print(log, false, false, "3.1");
*/
} finally {
// log is assigned before the try, so the null check is purely defensive
if (log != null) log.close();
}
}
}

View file

@ -0,0 +1,667 @@
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
public class GenerateData implements UCD_Types {
/**
 * Command-line driver. Loads the default UCD and then processes the
 * arguments in order; each argument names one generation task
 * (case-insensitive). "version" consumes the following argument and switches
 * the active UCD. An argument starting with '#' ends processing immediately.
 * Unknown arguments produce a short message and are skipped.
 *
 * @param args task names, see the if/else chain below
 * @throws IOException propagated from the generation tasks
 */
public static void main (String[] args) throws IOException {
    System.out.println("START");
    ucd = UCD.make();
    System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
    String version = ucd.getVersion();

    for (int i = 0; i < args.length; ++i) {
        String arg = args[i];
        // length check first: an empty argument must not crash charAt(0)
        if (arg.length() > 0 && arg.charAt(0) == '#') return; // skip rest of line
        int mask = 0;

        Utility.fixDot();
        System.out.println("Argument: " + args[i]);

        if (arg.equalsIgnoreCase("version")) {
            if (i + 1 >= args.length) {
                // "version" needs a value; stop instead of running off the array
                System.out.println(" ! Missing value after \"version\"");
                break;
            }
            version = args[++i];
            ucd = UCD.make(version);
        } else if (arg.equalsIgnoreCase("partition")) {
            partitionProperties();
        } else if (arg.equalsIgnoreCase("list")) {
            listProperties();
        } else if (arg.equalsIgnoreCase("diff")) {
            listDifferences();
        } else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
            generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedBidiClass-" + version );
        } else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
            mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
            mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
            generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-" + version );
        } else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
            generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedEastAsianWidth-" + version );
        } else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
            generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedGeneralCategory-" + version );
        } else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
            generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedCombiningClass-" + version );
        } else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
            generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedDecompositionType-" + version );
        } else if (arg.equalsIgnoreCase("DerivedNumericType")) {
            generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedNumericType-" + version );
        // (a second, unreachable "DerivedEastAsianWidth" branch used to sit here)
        } else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
            generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedJoiningType-" + version );
        } else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
            generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedJoiningGroup-" + version );
        } else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
            generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedBinaryProperties-" + version );
        } else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
            generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedNumericValues-" + version );
        } else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
            mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
            generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
        } else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
            generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
                "DerivedLineBreak-" + version );
        } else if (arg.equalsIgnoreCase("Scripts")) {
            // NOTE(review): unlike every other task, no version is appended here - confirm intent
            generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-");
        } else if (arg.equalsIgnoreCase("PropList")) {
            generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
                KEEP_SPECIAL, HEADER_EXTEND, "PropList-" + version);
        } else if (arg.equalsIgnoreCase("AllBinary")) {
            generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
                KEEP_SPECIAL, HEADER_EXTEND, "AllBinary-" + version);
        } else if (arg.equalsIgnoreCase("NormalizationTest")) {
            writeNormalizerTestSuite("NormalizationTest-" + version + ".txt" );
        } else if (arg.equalsIgnoreCase("generateCompExclusions")) {
            generateCompExclusions();
        } else {
            System.out.println(" ! Unknown option -- must be one of the following (case-insensitive)");
            System.out.println(" ! generateCompExclusions,...");
        }

        //checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
        //checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
        //generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
        // HEADER_DERIVED, "DerivedPropData2-" + version );
        //generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-" + version );
        //listStrings("LowerCase-" + version , 0,0);
        //generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-" + version );
        // AGE stuff
        //UCD ucd = UCD.make();
        //System.out.println(ucd.getAgeID(0x61));
        //System.out.println(ucd.getAgeID(0x2FA1D));
        //
    }
    System.out.println("END");
}
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
/**
 * Debug helper: prints the hex form of test and of its NFKC normalization,
 * followed by a per-character breakdown (via show) of each.
 */
public static void checkHoffman(String test) {
    String normalized = nfkc.normalize(test);
    String summary = Utility.hex(test) + " => " + Utility.hex(normalized);
    System.out.println(summary);

    System.out.println();
    show(test, 0);

    System.out.println();
    show(normalized, 0);
}
/**
 * Prints one line per code point of s: indentation, code, combining class
 * (right-aligned by padding to width 4) and name. When the NFKC normalization
 * of a code point differs from the code point itself, the normalized string
 * is printed recursively with four more columns of indentation.
 */
public static void show(String s, int indent) {
    int pos = 0;
    while (pos < s.length()) {
        int cp = UTF32.char32At(s, pos);

        String cc = " " + ucd.getCombiningClass(cp);
        cc = Utility.repeat(" ", 4 - cc.length()) + cc;
        System.out.println(Utility.repeat(" ", indent) + ucd.getCode(cp) + cc + " " + ucd.getName(cp));

        String decomp = nfkc.normalize(cp);
        boolean unchanged = decomp.equals(UTF32.valueOf32(cp));
        if (!unchanged) {
            show(decomp, indent + 4);
        }

        // step over one or two UTF-16 units, depending on the code point
        pos += UTF32.count16(cp);
    }
}
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
static {
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
/**
 * Strips a one-digit draft suffix ("d" followed by a single digit) that sits
 * directly before a trailing ".txt", e.g. "X-3.1.0d1.txt" becomes "X-3.1.0.txt".
 * Any other name is returned unchanged. Prints a notice to System.out when a
 * name is changed.
 *
 * @param s file name to examine
 * @return the name without the draft suffix, or s itself if there is none
 */
public static String fixFile(String s) {
    int len = s.length();
    if (!s.endsWith(".txt")) return s;
    // too short to hold "d" + digit in front of ".txt"; avoids a negative index below
    if (len < 6) return s;
    if (s.charAt(len-6) != 'd') return s;
    char c = s.charAt(len-5);
    if (c < '0' || '9' < c) return s;
    System.out.println("Fixing File Name");
    return s.substring(0,len-6) + s.substring(len-4);
}
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
/**
 * Writes the standard comment header of a generated data file: the file name,
 * a description block chosen by headerChoice (HEADER_SCRIPTS, HEADER_EXTEND,
 * anything else is treated as the derived-data header), the current date and
 * a fixed note about omitted code points.
 */
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
    String[] description;
    if (headerChoice == HEADER_SCRIPTS) {
        description = new String[] {
            "# For documentation, see UTR #24: Script Names",
            "# http://www.unicode.org/unicode/reports/tr24/"
        };
    } else if (headerChoice == HEADER_EXTEND) {
        description = new String[] {
            "# Unicode Character Database: Extended Properties",
            "# For documentation, see PropList.html"
        };
    } else {
        description = new String[] {
            "# Unicode Character Database: Derived Property Data",
            "# Generated algorithmically from the Unicode Character Database",
            "# For documentation, see DerivedProperties.html"
        };
    }

    output.println("# " + fileName + ".txt");
    output.println("#");
    for (int k = 0; k < description.length; ++k) {
        output.println(description[k]);
    }
    output.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
    output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
    output.println("# except when listing Noncharacter or Cn.");
    output.println("# ================================================");
    output.println();
}
/**
 * Writes a derived-property file to GEN_DIR + fileName. Switches the shared
 * ucd to version "310", writes the header, and then runs one
 * DerivedPropertyLister for every bit set in bitMask whose index is below
 * DerivedPropertyLister.LIMIT.
 *
 * @param bitMask      bit i selects derived property i
 * @param headerChoice header style passed to doHeader()
 * @param fileName     output file name, relative to GEN_DIR
 * @throws IOException if the output file cannot be opened
 */
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
    ucd = UCD.make("310");
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
    try {
        doHeader(fileName, output, headerChoice);
        for (int i = 0; i < 32; ++i) {
            if ((bitMask & (1<<i)) == 0) continue;
            if (i >= DerivedPropertyLister.LIMIT) break;
            System.out.print('.'); // progress
            output.println("# ================================================");
            output.println();
            new DerivedPropertyLister(ucd, i, output).print();
        }
    } finally {
        // release the file even if a lister throws
        output.close();
    }
}
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
if (!ucd.isRepresented(i)) continue;
if (ucd30.isRepresented(i)) continue;
String string = "";
switch(type) {
case 0: string = ucd.getSimpleLowercase(i);
}
if (UTF32.length32(string) == 1 && UTF32.char32At(string,0) == i) continue;
output.println(Utility.hex(i) + "; C; " + Utility.hex(string) + "; # " + ucd.getName(i));
}
output.close();
}
*/
/**
 * Writes the CompLister listing to GEN_DIR + "CompositionExclusionsDelta.txt".
 *
 * @throws IOException if the output file cannot be opened
 */
public static void generateCompExclusions() throws IOException {
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
    try {
        new CompLister(output).print();
    } finally {
        // release the file even if the lister throws
        output.close();
    }
}
/**
 * Lists code points whose decomposition type is CANONICAL in UCD "310" but
 * was not CANONICAL in UCD "300", labelled by the code-point length of their
 * decomposition mapping.
 */
static class CompLister extends PropertyLister {
    /** The older UCD ("300") used for comparison. */
    UCD oldUCD;
    /** Decomposition length of the most recently accepted code point. */
    int oldLength = 0;

    public CompLister(PrintStream output) {
        this.output = output;
        ucdData = UCD.make("310");
        oldUCD = UCD.make("300");
        showOnConsole = true;
    }

    /** The label is the decomposition mapping's length in code points, as text. */
    public String propertyName(int cp) {
        int length = UTF32.length32(ucdData.getDecompositionMapping(cp));
        return length + "";
    }

    /**
     * EXCLUDE unless cp became canonically decomposable between the two
     * versions. For accepted code points, BREAK is returned when the
     * decomposition length differs from that of the previously accepted one,
     * INCLUDE otherwise. Updates oldLength as a side effect.
     */
    public byte status(int cp) {
        boolean canonicalNow = ucdData.getDecompositionType(cp) == CANONICAL;
        if (!canonicalNow || oldUCD.getDecompositionType(cp) == CANONICAL) {
            return EXCLUDE;
        }
        int previousLength = oldLength;
        oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
        if (previousLength != oldLength) {
            return BREAK;
        }
        return INCLUDE;
    }
}
/**
 * Counts how many distinct combinations of unified binary property values
 * occur among code points. Collects the ids of all defined properties
 * (skipping the JOINING_GROUP, AGE and COMBINING_CLASS types), then for every
 * allocated code point whose category is not UNASSIGNED, PRIVATE_USE or
 * SURROGATE builds a BitSet of the property values and stores the first code
 * point seen for each new BitSet. Progress and the final number of distinct
 * sets are printed to System.out.
 */
public static void partitionProperties() throws IOException {
// find properties
int count = 0;
// NOTE(review): fixed capacity; would overflow if more than 500 properties are defined
int[] props = new int[500];
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
int iType = i & 0xFF00;
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
props[count++] = i;
}
System.out.println("props: " + count);
// probe is reused for every code point; bit i holds the value of props[i]
BitSet probe = new BitSet();
Map map = new HashMap();
int total = 0;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = ucd.getCategory(cp);
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
if (!ucd.isAllocated(cp)) continue;
for (int i = 0; i < count; ++i) {
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
if (iProp) probe.set(i); else probe.clear(i);
}
++total;
if (!map.containsKey(probe)) {
// store a copy, because probe itself is mutated on the next iteration
map.put(probe.clone(), UTF32.valueOf32(cp));
Utility.fixDot();
System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + ucd.getCodeAndName(cp));
}
}
Utility.fixDot();
System.out.println("Set Size: " + map.size());
}
/**
 * Compares every pair (i, j) with j > i of defined unified binary properties
 * and writes their set relationship to GEN_DIR + "PropertyDifferences.txt".
 * For each pair the code points having both, only i, and only j are counted
 * over all allocated code points whose category is not UNASSIGNED,
 * PRIVATE_USE or SURROGATE. Pairs where either property matches nothing are
 * not reported. Progress is printed to System.out.
 *
 * NOTE(review): the output stream is only closed on the normal path.
 */
public static void listDifferences() throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "PropertyDifferences.txt"));
for (int i = 1; i < LIMIT_ENUM; ++i) {
int iType = i & 0xFF00;
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
System.out.println();
System.out.println();
System.out.println(iNameLong);
output.println("#" + iNameLong);
// high byte (property type) of the last j for which a sub-heading was printed
int last = -1;
for (int j = i+1; j < LIMIT_ENUM; ++j) {
int jType = j & 0xFF00;
// values of the same non-binary type as i are skipped as well
if (jType == JOINING_GROUP || jType == AGE || jType == COMBINING_CLASS || jType == SCRIPT
|| (jType == iType && jType != BINARY_PROPERTIES)) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, j)) continue;
if ((j >> 8) != last) {
last = j >> 8;
System.out.println();
System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
// NOTE(review): flushes the file stream between console writes - confirm this is intended
output.flush();
output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]);
} else {
System.out.print('.');
}
System.out.flush();
// bothCount: i and j; i_jPropCount: i only; j_iPropCount: j only
int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
int cat = ucd.getCategory(cp);
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
if (!ucd.isAllocated(cp)) continue;
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
if (jProp) ++jCount;
if (iProp) {
++iCount;
if (jProp) ++bothCount;
else ++i_jPropCount;
} else if (jProp) ++j_iPropCount;
}
if (iCount == 0 || jCount == 0) continue;
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
// Both subset cases are labelled "CONTAINS"; the operand order chosen
// below puts the property with more exclusive members first.
String rel = bothCount == 0 ? "DISJOINT"
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
: i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
: j_iPropCount == 0 ? "CONTAINS"
: "OVERLAPS";
if (j_iPropCount > i_jPropCount) {
// reverse output
output.println(jNameShort + "\t" + iNameShort + "\t" + rel
+ "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
} else {
output.println(iNameShort + "\t" + jNameShort + "\t" + rel
+ "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
}
}
}
output.close();
}
/**
 * Prints one HTML table row per defined unified binary property value to
 * System.out. Each row has three cells: an elided form "\p{value}" (only for
 * CATEGORY, SCRIPT and BINARY_PROPERTIES types), an abbreviated form
 * "\p{prop=value}" and a long form "\p{prop=value}" (both empty for
 * BINARY_PROPERTIES). JOINING_GROUP and AGE types are skipped, as are values
 * whose long id is "<unused>".
 */
public static void listProperties() {
for (int i = 0; i < LIMIT_ENUM; ++i) {
int type = i & 0xFF00;
if (type == JOINING_GROUP || type == AGE) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
String value = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
if (value.length() == 0) value = "none";
else if (value.equals("<unused>")) continue;
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
if (abbvalue.length() == 0) abbvalue = "no";
// combining classes use their own name table; unnamed classes are
// skipped, except value 0, which is shown as "99"
if (type == COMBINING_CLASS) {
value = MyPropertyLister.getCombiningName(i);
if (value.length() == 0) {
if ((i & 0xFF) == 0) value = "99";
else continue;
}
abbvalue = value;
}
String elide = "";
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
+ abbvalue
+ "}";
String abb = "";
if (type != BINARY_PROPERTIES) abb = "\\p{"
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
+ "="
+ abbvalue
+ "}";
String norm = "";
if (type != BINARY_PROPERTIES) norm = "\\p{"
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
+ "="
+ value
+ "}";
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
}
}
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
/**
 * Writes one data file listing, property value by property value, the code
 * points for each defined unified binary property id in [startEnum, endEnum).
 * The file is GEN_DIR + file + "dX.txt". When endEnum equals LIMIT_ENUM a
 * final section lists code points by numeric value.
 *
 * @param startEnum    first property id (inclusive)
 * @param endEnum      last property id (exclusive)
 * @param skipSpecial  SKIP_SPECIAL to omit ids from
 *                     BINARY_PROPERTIES|CompositionExclusion up to AGE+NEXT_ENUM
 * @param headerChoice header style passed to doHeader()
 * @param file         output file name stem
 * @throws IOException if the output file cannot be opened
 *
 * NOTE(review): the output stream is only closed on the normal path.
 */
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
int headerChoice, String file) throws IOException {
//System.out.println(ucd.toString(0x1E0A));
/*
System.out.println(ucd.getData(0xFFFF));
System.out.println(ucd.getData(0x100000));
System.out.println(ucd.getData(0x100000-1));
System.out.println(ucd.getData(0x100000-2));
System.out.println(ucd.getData(0x100000-3));
if (true) return;
String test2 = ucd.getName(0x2A6D6);
//*/
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file + "dX.txt"));
doHeader(file, output, headerChoice);
// id for which the most recent property-type heading was written
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (BINARY_PROPERTIES | Non_break)
|| i == (JOINING_TYPE | JT_U)
|| i == (JOINING_GROUP | NO_SHAPING)
) continue; // skip zero case
if (skipSpecial == SKIP_SPECIAL
&& i >= (BINARY_PROPERTIES | CompositionExclusion)
&& i < (AGE + NEXT_ENUM)) continue;
// full heading when the property type (high byte) changes, for ids up to
// BINARY_PROPERTIES or from SCRIPT on; otherwise just a separator line
if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
output.println();
output.println("# ================================================");
output.println("# " + UCD_Names.UNIFIED_PROPERTIES[i>>8]);
output.println("# ================================================");
output.println();
System.out.println();
System.out.println(UCD_Names.UNIFIED_PROPERTIES[i>>8]);
last = i;
} else {
output.println("# ================================================");
output.println();
}
System.out.print(".");
new MyPropertyLister(ucd, i, output).print();
}
if (endEnum == LIMIT_ENUM) {
output.println();
output.println("# ================================================");
output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
output.println("# ================================================");
output.println();
System.out.println();
System.out.println("@NUMERIC VALUES");
// collect the distinct numeric values in sorted order
Set floatSet = new TreeSet();
// NOTE(review): the bound excludes U+10FFFF, unlike the "<= 0x10FFFF" loops elsewhere in this class
for (int i = 0; i < 0x10FFFF; ++i) {
float nv = ucd.getNumericValue(i);
if (Float.isNaN(nv)) continue;
floatSet.add(new Float(nv));
}
Iterator it = floatSet.iterator();
while(it.hasNext()) {
new MyFloatLister(ucd, ((Float)it.next()).floatValue(), output).print();
output.println();
System.out.print(".");
}
}
output.close();
System.out.println();
}
static UCD ucd;
static public Normalizer formC, formD, formKC, formKD;
/**
 * Writes the normalization conformance test file. The file consists of a
 * comment header stating the invariants, followed by three parts:
 * <ul>
 * <li>@Part0: the hand-picked strings in testSuiteCases;</li>
 * <li>@Part1: every assigned, non-private-use code point that is changed by
 *     at least one normalization form;</li>
 * <li>@Part2: canonical-ordering tests, two lines for every assigned,
 *     non-private-use code point with a non-zero combining class.</li>
 * </ul>
 * Resets the shared ucd and the four shared normalizers as a side effect.
 *
 * @param fileName name of the file to write (opened via Utility.openPrintWriter)
 * @throws IOException if the file cannot be opened
 */
static public void writeNormalizerTestSuite(String fileName) throws IOException {
    ucd = UCD.make();

    PrintWriter log = Utility.openPrintWriter(fileName);
    try {
        formC = new Normalizer(Normalizer.NFC);
        formD = new Normalizer(Normalizer.NFD);
        formKC = new Normalizer(Normalizer.NFKC);
        formKD = new Normalizer(Normalizer.NFKD);

        // example[k] is the first code point found with combining class k
        String[] example = new String[256];

        log.println("# " + fixFile(fileName));
        log.println("#");
        log.println("# Normalization Test Suite");
        log.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
        log.println("# Format:");
        log.println("#");
        log.println("# Columns (c1, c2,...) are separated by semicolons");
        log.println("# Comments are indicated with hash marks");
        log.println("#");
        log.println("# CONFORMANCE:");
        log.println("# 1. The following invariants must be true for all conformant implementations");
        log.println("#");
        log.println("# NFC");
        log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
        log.println("# c4 == NFC(c4) == NFC(c5)");
        log.println("#");
        log.println("# NFD");
        log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
        // closing parenthesis was missing in the emitted text
        log.println("# c5 == NFD(c4) == NFD(c5)");
        log.println("#");
        log.println("# NFKC");
        log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
        log.println("#");
        log.println("# NFKD");
        log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
        log.println("#");
        log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
        log.println("# listed in Part 1, the following invariants must be true for all conformant");
        log.println("# implementations:");
        log.println("#");
        log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");

        // console progress labels now match the @PartN markers in the file
        System.out.println("Writing Part 0");
        log.println("#");
        log.println("@Part0 # Specific cases");
        log.println("#");
        for (int j = 0; j < testSuiteCases.length; ++j) {
            writeLine(testSuiteCases[j], log, false);
        }

        System.out.println("Writing Part 1");
        log.println("#");
        log.println("@Part1 # Character by character test");
        log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
        log.println("#");
        // all code-point loops use <= so that U+10FFFF is examined too
        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            String cc = UTF32.valueOf32(ch);
            writeLine(cc,log, true); // check=true: unchanged characters are skipped
        }
        Utility.fixDot();

        System.out.println("Finding Examples");
        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            int cc = ucd.getCombiningClass(ch);
            if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
        }
        Utility.fixDot();

        System.out.println("Writing Part 2");
        log.println("#");
        log.println("@Part2 # Canonical Order Test");
        log.println("#");
        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            short c = ucd.getCombiningClass(ch);
            if (c == 0) continue;

            // add character with higher class, same class, lower class
            String sample = "";
            for (int i = c+1; i < example.length; ++i) {
                if (example[i] == null) continue;
                sample += example[i];
                break;
            }
            sample += example[c];
            for (int i = c-1; i > 0; --i) {
                if (example[i] == null) continue;
                sample += example[i];
                break;
            }

            writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
            writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
        }
        Utility.fixDot();
        log.println("#");
        log.println("# END OF FILE");
    } finally {
        // release the file even if normalization or a UCD query throws
        log.close();
    }
}
/**
 * Writes one test line for the source string cc: the five columns
 * (source, NFC, NFD, NFKC, NFKD) in hex, followed by a comment holding the
 * same five strings as characters and the name of cc.
 *
 * @param cc    source string
 * @param log   destination writer
 * @param check when true, nothing is written if cc is unchanged by all four forms
 */
static void writeLine(String cc, PrintWriter log, boolean check) {
    String c = formC.normalize(cc);
    String d = formD.normalize(cc);
    String kc = formKC.normalize(cc);
    String kd = formKD.normalize(cc);

    boolean unchanged = cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd);
    if (check && unchanged) {
        return;
    }

    // consistency check: decomposing a composed form must give the decomposed form
    String dc = formD.normalize(c);
    String dkc = formD.normalize(kc);
    boolean consistent = dc.equals(d) && dkc.equals(kd);
    if (!consistent) {
        System.out.println("Danger Will Robinson!");
        Normalizer.SHOW_PROGRESS = true;
        d = formD.normalize(cc);
    }

    // printout
    String hexColumns =
        Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
        + Utility.hex(kc," ") + ";" + Utility.hex(kd," ");
    String charColumns =
        comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; ";
    log.println(hexColumns + "; # (" + charColumns + ") " + ucd.getName(cc));
}
static StringBuffer commaResult = new StringBuffer();
// not recursive!!!
/**
 * Returns a copy of s in which every code point of category Mn is preceded
 * by U+25CC. Uses the shared commaResult buffer, so it must not be called
 * recursively.
 */
static final String comma(String s) {
    commaResult.setLength(0);
    int cp;
    // advance by the UTF-16 length of the code point just read (the index was
    // passed here before, which mis-stepped on supplementary characters)
    for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
        cp = UTF32.char32At(s, i);
        if (ucd.getCategory(cp) == Mn) commaResult.append('\u25CC');
        UTF32.append32(commaResult, cp);
    }
    return commaResult.toString();
}
static final String[] testSuiteCases = {
"\u1E0A",
"\u1E0C",
"\u1E0A\u0323",
"\u1E0C\u0307",
"D\u0307\u0323",
"D\u0323\u0307",
"\u1E0A\u031B",
"\u1E0C\u031B",
"\u1E0A\u031B\u0323",
"\u1E0C\u031B\u0307",
"D\u031B\u0307\u0323",
"D\u031B\u0323\u0307",
"\u00C8",
"\u0112",
"E\u0300",
"E\u0304",
"\u1E14",
"\u0112\u0300",
"\u1E14\u0304",
"E\u0304\u0300",
"E\u0300\u0304",
"\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F",
"\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"
};
}

View file

@ -0,0 +1,314 @@
package com.ibm.text.utility;
import java.io.*;
import java.util.*;
import com.ibm.text.UCD.*;
public class MLStreamWriter extends Writer {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
/**
 * Creates a markup writer on top of the given PrintWriter.
 *
 * @param output destination for all generated markup
 * @param HTML   stored in isHTML
 */
public MLStreamWriter (PrintWriter output, boolean HTML) {
out = output;
isHTML = HTML;
}
/** Same as MLStreamWriter(output, true). */
public MLStreamWriter (PrintWriter output) {
this(output,true);
}
/**
 * Opens a new element: finishes any start tag still open, prints
 * "<" + elementName, pushes the name on the element stack and marks the
 * writer as being inside a start tag, so that attributes can follow.
 * The name "!--" (comment) is printed with the additional FORCE flag.
 *
 * @return this writer, for chaining
 */
public MLStreamWriter el(String elementName) {
closeIfOpen();
print('<', AFTER);
print(elementName, elementName.equals("!--") ? AFTER+FORCE : AFTER);
stack.add(elementName);
inElement = true;
return this;
}
/**
 * Finishes a pending start tag by printing ">" unless the innermost element
 * is a comment ("!--"), and clears the in-start-tag flag.
 *
 * @return this writer, for chaining
 */
private MLStreamWriter closeIfOpen() {
if (inElement && !"!--".equals(stack.get(stack.size()-1))) {
print('>',BEFORE+FORCE);
}
inElement = false;
return this;
}
/**
 * Closes the current element and then writes elementName as text.
 * NOTE(review): the name suggests "close, then open element"; this calls
 * tx() rather than el() - confirm which is intended.
 */
final public MLStreamWriter cel(String elementName) {
return cl().tx(elementName);
}
/**
* Adds an attribute to the start tag that is currently open, written as
* name="value" with the value passed through quoted().
* @param attributeName attribute name, written as is
* @param attributeValue attribute value, escaped by quoted()
* @return this writer, for call chaining
* @throws IllegalArgumentException if no start tag is open (inElement is false)
*/
public MLStreamWriter at(String attributeName, String attributeValue) {
if (!inElement) {
throw new IllegalArgumentException("attribute \"" + attributeName + "\" not in element");
}
print(' ', BOTH);
print(attributeName, AFTER);
print('=', AFTER);
print('"');
print(quoted(attributeValue));
print('"', AFTER);
return this;
}
/**
* Adds an attribute whose value is the decimal form of an int.
*/
public MLStreamWriter at(String attributeName, int value) {
return at(attributeName, String.valueOf(value));
}
/**
* Finishes any pending start tag and writes a line terminator.
* NOTE(review): the column counter len is not reset here, unlike in tryBreak() --
* confirm whether that is intended.
* @return this writer, for call chaining
*/
public MLStreamWriter CR() {
closeIfOpen();
out.println();
return this;
}
/*public MLStreamWriter comment() {
closeIfOpen();
print("<!--");
CR();
return this;
}
public MLStreamWriter endComment() {
print("-->");
return this;
}
*/
/**
* Writes character data: finishes any pending start tag, then prints the text
* after escaping it with quoted().
* @return this writer, for call chaining
*/
public MLStreamWriter tx(String text) {
closeIfOpen();
print(quoted(text));
return this;
}
/** Writes a single char as text. */
final public MLStreamWriter tx(char text) {
return tx(String.valueOf(text));
}
/** Writes the decimal form of an int as text. */
final public MLStreamWriter tx(int text) {
return tx(String.valueOf(text));
}
/** Writes the hex form of each char of the string (see hex(String)) as text. */
final public MLStreamWriter tx16(String text) {
return tx(hex(text));
}
/** Writes the 4-digit hex form of a char as text. */
final public MLStreamWriter tx16(char text) {
return tx(hex(text));
}
/** Writes the 8-digit hex form of an int as text. */
final public MLStreamWriter tx16(int text) {
return tx(hex(text));
}
/**
 * Closes the innermost open element, writing its end tag ("-->" for the comment
 * pseudo-element "!--").
 * The element stack is inspected before it is popped, so a failed call leaves the
 * stack as it was.
 * @param closingElement expected name of the innermost element, or null to close
 *        whatever is innermost without checking
 * @return this writer, for call chaining
 * @throws IllegalStateException if no element is open
 * @throws IllegalArgumentException if closingElement is not the innermost element
 */
public MLStreamWriter cl(String closingElement) {
    closeIfOpen();
    if (stack.isEmpty()) {
        throw new IllegalStateException("no open element to close"
            + (closingElement == null ? "" : " (asked to close \"" + closingElement + "\")"));
    }
    int top = stack.size() - 1;
    String lastElement = (String)stack.get(top);
    if (closingElement != null && !closingElement.equals(lastElement)) {
        throw new IllegalArgumentException("mismatch when closing \"" + closingElement
            + "\", current active element is \"" + lastElement + "\"");
    }
    // Pop before printing: tryBreak() indents by the current stack depth.
    stack.remove(top);
    if (lastElement.equals("!--")) {// hack for XML/HTML
        print("-->",BEFORE+FORCE);
    } else {
        print("</");
        print(lastElement);
        print('>',BEFORE);
    }
    return this;
}
/** Closes the innermost open element without checking its name. */
final public MLStreamWriter cl() {
return cl(null);
}
/**
* Closes every element still on the stack, innermost first.
* @return this writer, for call chaining
*/
public MLStreamWriter closeAllElements() {
// one cl() per element that was open on entry
for (int i = stack.size()-1; i >= 0; --i) {
cl(null);
}
return this;
}
// stream stuff
/**
* Writer implementation: treats the characters as text content. Finishes any
* pending start tag and writes the quoted() form of the slice.
* NOTE(review): this goes straight to out.print, so unlike tx() it does not update
* the column counter field len (which the parameter len shadows here).
*/
public void write(char[] source, int start, int len) {
closeIfOpen();
// later make more efficient!!
out.print(quoted(new String(source, start, len)));
}
/** Closes all open elements, then closes the underlying stream. */
public void close() {
closeAllElements();
out.close();
}
/** Flushes the underlying stream. */
public void flush() {
out.flush();
}
// Utility methods
/**
* Writes one table cell holding ch, a line break element, and a tt element with
* the hex form of codepoint, optionally followed by a space and cat.
* @param ch text shown in the cell
* @param type element name, optionally followed by ".className"; the part after the
*        first dot becomes the class attribute
* @param codepoint text whose hex form is shown; if null, ch is used
* @param cat extra text appended after the hex, or null for none
* @return this writer, for call chaining
*/
final public MLStreamWriter cell(String ch, String type, String codepoint, String cat) {
if (codepoint == null) codepoint = ch;
int dotpos = type.indexOf('.');
if (dotpos == -1) el(type);
else {
el(type.substring(0,dotpos));
at("class",type.substring(dotpos+1));
}
/*
if (color == -1) {
el("th");
} else {
el("td");
if (color != 0xFFFFFF) {
at("bgcolor","#"+hex(color,6));
}
}
*/
tx(ch).el("br").el("tt").tx16(codepoint);
if (cat != null) tx(" ").tx(cat);
// three closes: tt, br, and the cell element opened above
cl().cl().cl();
return this;
}
/** Writes a "td" cell showing ch and its own hex form. */
final public MLStreamWriter cell(String ch) {
return cell(ch,"td",null,null);
}
/** Writes a cell of the given type showing ch and its own hex form. */
final public MLStreamWriter cell(String ch, String type) {
return cell(ch,type,null,null);
}
/** Writes a cell of the given type showing ch and the hex form of codepoint. */
final public MLStreamWriter cell(String ch, String type, String codepoint) {
return cell(ch,type,codepoint,null);
}
/**
 * Supplies an upper-case hex representation of an integer (without 0x), treating
 * the value as unsigned and left-padding it with zeros to at least width digits.
 * Values that need more than width digits are returned unpadded and in full.
 * The earlier substring-based padding threw in that case, and for any width
 * above 8.
 * @param i value to format
 * @param width minimum number of digits
 * @return the zero-padded hex string
 */
static public String hex(int i, int width) {
    String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
    int padding = width - result.length();
    if (padding <= 0) return result;
    StringBuffer buffer = new StringBuffer(width);
    for (int j = 0; j < padding; ++j) buffer.append('0');
    return buffer.append(result).toString();
}
/**
* Supplies a zero-padded, 8-digit hex representation of an integer (without 0x)
*/
static public String hex(int i) {
return hex(i,8);
}
/**
* Supplies a zero-padded, 4-digit hex representation of a Unicode character (without 0x, \\u)
*/
static public String hex(char i) {
return hex(i,4);
}
/**
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u).
* Each UTF-16 code unit is formatted separately as 4 digits, so a supplementary
* character appears as its two surrogates.
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
*/
static public String hex(String s, String sep) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
if (i != 0) result.append(sep);
result.append(hex(s.charAt(i)));
}
return result.toString();
}
/**
* Same as hex(s, " "): code units separated by a single space.
*/
static public String hex(String s) {
return hex(s," ");
}
/**
* Writes a small-font author credit: "[", a link to url labelled name, a script
* element (wrapped in a comment) that prints document.lastModified, and "]".
* NOTE(review): the script text goes through tx() and therefore quoted(), so its
* apostrophes are escaped when isHTML is false -- confirm that is acceptable.
*/
public void author(String name, String url) {
el("font").at("size","-3").tx("[").el("a").at("href",url).tx(name).cl("a").el("script").el("!--");
tx("document.write(', ', document.lastModified);");
cl("!--").cl("script").tx("]").cl("font");
}
// ================== PRIVATES =================
// destination stream
PrintWriter out;
// when true, quoted() leaves apostrophes unescaped
boolean isHTML;
// names of the currently open elements, innermost last
ArrayList stack = new ArrayList();
// true while a start tag has been begun but its '>' not yet written
boolean inElement = false;
// applied to all text by quoted() before escaping
Normalizer formC = new Normalizer(Normalizer.NFC);
// running count of characters written on the current line (see print/tryBreak)
int len;
// tryBreak() starts a new line once len would exceed this
int maxLineLength = 60;
// later, add better line end management, indenting
// break hints for print(): BEFORE and AFTER are bit flags, BOTH is their union,
// FORCE makes the break unconditional
static final int NONE=0, BEFORE=1, AFTER=2, BOTH=3, FORCE = 4; // chosen for bits!!
/** Prints s with no line-break opportunity. */
final void print(String s) {
print(s,NONE);
}
/** Prints c with no line-break opportunity. */
final void print(char c) {
print(c,NONE);
}
/**
* Prints s as is (no escaping) and adds its length to len.
* @param doesBreak combination of BEFORE, AFTER and FORCE; tryBreak() is consulted
*        before and/or after the text accordingly
*/
final void print(String s, int doesBreak) {
if ((doesBreak & BEFORE) != 0) tryBreak(s.length(), doesBreak);
len += s.length();
out.print(s);
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
}
/**
* Single-character variant of print(String, int).
*/
final void print(char c, int doesBreak) {
if ((doesBreak & BEFORE) != 0) tryBreak(1, doesBreak);
++len;
out.print(c);
if ((doesBreak & AFTER) != 0) tryBreak(0, doesBreak);
}
/**
* Starts a new line if doesBreak has the FORCE bit set, or if writing toAdd more
* characters would push the current line past maxLineLength. The new line is
* indented by one space per open element, and len is reset to that indent.
* @param toAdd number of characters about to be written
* @param doesBreak break flags; only FORCE is examined here
*/
void tryBreak(int toAdd, int doesBreak) {
if ((doesBreak & FORCE) != 0 || (len + toAdd) > maxLineLength) {
out.println();
len = stack.size();
for (int i = 0; i < len; ++i) out.print(' ');
}
}
/**
* Normalizes source with formC and escapes it for use as markup text or as an
* attribute value.
* The characters " < & > are always replaced by entities; ' becomes an entity
* only when isHTML is false. Newline, carriage return and tab pass through.
* Other characters below space, U+007F..U+009F, surrogate code units, and
* U+FFFE/U+FFFF are replaced by U+FFFD.
* @param source text to escape
* @return the normalized, escaped text
*/
public String quoted(String source) {
source = formC.normalize(source);
StringBuffer result = new StringBuffer();
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
switch(ch) {
case '\'':
if (!isHTML) {
result.append("&apos;");
} else {
result.append(ch);
}
break;
case '\"':
result.append("&quot;");
break;
case '<':
result.append("&lt;");
break;
case '&':
result.append("&amp;");
break;
case '>':
result.append("&gt;");
break;
case '\n': case '\r': case '\t':
result.append(ch);
break;
// surrogates are currently replaced, so supplementary characters are lost
default: if (ch < ' ' // do surrogates later
|| ch >= '\u007F' && ch <= '\u009F'
|| ch >= '\uD800' && ch <= '\uDFFF'
|| ch >= '\uFFFE') {
result.append('\uFFFD');
} else {
result.append(ch);
}
break;
}
}
return result.toString();
}
}

View file

@ -0,0 +1,31 @@
package com.ibm.text.UCD;
import java.io.*;
/**
 * Property lister that selects the code points whose numeric value equals a
 * given float.
 */
class MyFloatLister extends PropertyLister {
    // numeric value being listed
    private float propMask;

    /**
     * @param ucd character database to query
     * @param f numeric value to select
     * @param output stream that receives the listing
     */
    public MyFloatLister(UCD ucd, float f, PrintStream output) {
        this.propMask = f;
        this.output = output;
        this.ucdData = ucd;
    }

    /** Returns the numeric value of cp, formatted as a string. */
    public String propertyName(int cp) {
        return ""+ucdData.getNumericValue(cp);
    }

    /** Returns the numeric type ID of cp. */
    public String optionalName(int cp) {
        return ucdData.getNumericTypeID(cp);
    }

    /**
     * Classifies cp: CONTINUE for code points that are not represented or whose
     * category is Cn, INCLUDE when the numeric value equals the selected value,
     * EXCLUDE otherwise.
     */
    public byte status(int cp) {
        //if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp));
        // The original also tested mapToRepresentative(cp, false) here, but both
        // outcomes of that test returned CONTINUE, so the test was redundant.
        if (!ucdData.isRepresented(cp)) return PropertyLister.CONTINUE;
        if (ucdData.getCategory(cp) == Cn) return PropertyLister.CONTINUE;
        return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
    }
}

View file

@ -0,0 +1,270 @@
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
final class MyPropertyLister extends PropertyLister {
// when true, status() returns CONTINUE instead of EXCLUDE for filtered-out Cn code points
static final boolean BRIDGE = false;
// unified property number: property kind in the bits above the low byte, value in the low byte
private int propMask;
/**
* @param ucd character database to query
* @param propMask unified property number to list
* @param output stream that receives the listing
*/
public MyPropertyLister(UCD ucd, int propMask, PrintStream output) {
this.propMask = propMask;
this.output = output;
this.ucdData = ucd;
if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat
}
/**
 * Maps a combining class, taken from the low byte of propMask, to its
 * descriptive name.
 * @param propMask value whose low 8 bits hold the combining class
 * @return the name, or the empty string for a class that has no name here
 */
static String getCombiningName (int propMask) {
    final int combiningClass = propMask & 0xFF;
    switch (combiningClass) {
        case 0: return "NotReordered";
        case 1: return "Overlay";
        case 7: return "Nukta";
        case 8: return "KanaVoicing";
        case 9: return "Virama";
        case 202: return "AttachedBelowLeft";
        case 204: return "AttachedBelow";
        case 206: return "AttachedBelowRight";
        case 208: return "AttachedLeft";
        case 210: return "AttachedRight";
        case 212: return "AttachedAboveLeft";
        case 214: return "AttachedAbove";
        case 216: return "AttachedAboveRight";
        case 218: return "BelowLeft";
        case 220: return "Below";
        case 222: return "BelowRight";
        case 224: return "Left";
        case 226: return "Right";
        case 228: return "AboveLeft";
        case 230: return "Above";
        case 232: return "AboveRight";
        case 233: return "DoubleBelow";
        case 234: return "DoubleAbove";
        case 240: return "IotaSubscript";
        default: return "";
    }
}
/**
* Builds the comment line printed above the listing, chosen by property kind:
* the combining-class name (or a generic label), a fixed label for binary
* properties, nothing for joining groups, and otherwise the short property ID
* followed by the long ID in parentheses when the two differ.
*/
public String headerString() {
int main = (propMask & 0xFF00);
if (main == COMBINING_CLASS) {
String s = getCombiningName(propMask);
if (s.length() == 0) s = "Other Combining Class";
return "# " + s;
} else if (main == BINARY_PROPERTIES) {
return "# Binary Property";
} else if (main == JOINING_GROUP) {
return "";
} else {
String shortID = getUnifiedBinaryPropertyID(ucdData, propMask, SHORT);
String longID = getUnifiedBinaryPropertyID(ucdData, propMask, LONG);
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
}
}
/** Returns the ID of the property being listed; cp is not used. */
public String propertyName(int cp) {
return getUnifiedBinaryPropertyID(propMask);
}
/**
* Returns the general-category ID of cp for the trailing comment, with Lt, Ll and
* Lu all reported as "L&". Returns the empty string when propMask is below
* COMBINING_CLASS.
*/
public String optionalComment(int cp) {
if (propMask < COMBINING_CLASS) return ""; // skip gen cat
int cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
return ucdData.getCategoryID(cp);
}
/*
public String optionalName(int cp) {
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
return Utility.hex(ucdData.getDecompositionMapping(cp));
} else {
return "";
}
}
*/
/**
* Decides whether cp belongs in the listing.
* Code points with category Cn are filtered out first (EXCLUDE, or CONTINUE when
* BRIDGE is set), unless the property being listed is Noncharacter_Code_Point,
* Reserved_Cf_Code_Point or the category Cn itself. All other code points are
* INCLUDEd exactly when they have the property.
*/
public byte status(int cp) {
//if (cp == 0xFFFF) {
// System.out.println("# " + Utility.hex(cp));
//}
byte cat = ucdData.getCategory(cp);
//if (cp == 0x0385) {
// System.out.println(Utility.hex(firstRealCp));
//}
if (cat == Cn
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
&& propMask != (CATEGORY | Cn)) {
if (BRIDGE) return CONTINUE;
else return EXCLUDE;
}
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
/*
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
else if (propMask == (SCRIPT | GREEK_SCRIPT)) inSet = cp > 0x1D6A3;
}
*/
/* HACK
1D400;MATHEMATICAL BOLD CAPITAL A;Lu;0;L;<font> 0041;;;;N;;;;;
1D6A3;MATHEMATICAL MONOSPACE SMALL Z;Ll;0;L;<font> 007A;;;;N;;;;;
1D6A8;MATHEMATICAL BOLD CAPITAL ALPHA;Lu;0;L;<font> 0391;;;;N;;;;;
1D7C9;MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL;Ll;0;L;<font> 03D6;;;;N;;;;;
*/
if (!inSet) return EXCLUDE;
return INCLUDE;
}
/**
* @return unified property number
*/
public static boolean isUnifiedBinaryPropertyDefined(UCD ucd, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
case COMBINING_CLASS>>8: return ucd.isCombiningClassUsed((byte)propMask);
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
case AGE>>8: return propMask < LIMIT_AGE;
default: return false;
}
}
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
return getUnifiedBinaryProperty(ucdData, cp, propMask);
}
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
return ucd.getCategory(cp) == propMask;
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
return ucd.getCombiningClass(cp) == propMask;
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
return ucd.getBidiClass(cp) == propMask;
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
return ucd.getDecompositionType(cp) == propMask;
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
return ucd.getNumericType(cp) == propMask;
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
return ucd.getEastAsianWidth(cp) == propMask;
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
return ucd.getLineBreak(cp) == propMask;
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
return ucd.getJoiningType(cp) == propMask;
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroup(cp) == propMask;
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
return ucd.getBinaryProperty(cp, propMask);
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
return ucd.getScript(cp) == propMask;
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAge(cp) == propMask;
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
// name styles accepted by the ...PropertyID methods
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
/** Returns the NORMAL-style ID of a unified property number, using this lister's database. */
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
}
/**
* Returns the long ID if it equals the short one, otherwise the short ID
* followed by the long ID in parentheses.
*/
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
if (longOne.equals(shortOne)) return longOne;
return shortOne + "(" + longOne + ")";
}
/**
* Builds a "property=value" label for a unified property number.
* For non-binary kinds the part before '=' is the property-kind name and the part
* after is the value name. For binary properties there is no kind prefix, so the
* value name goes before '=' and "T" after it.
* Each part follows the same style rule: the short name below LONG, the long name
* for LONG (or when both names are equal), otherwise short(long).
* Empty value names are replaced by "xx" (short) and "none" (long).
*/
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
String pre = "";
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
if (style < LONG) pre = preShort;
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
else pre = preShort + "(" + preLong + ")";
}
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
if (shortOne.length() == 0) shortOne = "xx";
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
if (longOne.length() == 0) longOne = "none";
String post;
if (style < LONG) post = shortOne;
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
else post = shortOne + "(" + longOne + ")";
// binary property: the value name becomes the left-hand side
if (pre.length() == 0) {
pre = post + "=";
post = "T";
}
return pre + post;
}
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
int enum = unifiedPropMask >> 8;
byte propMask = (byte)unifiedPropMask;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
return UCD_Names.LONG_GC[propMask];
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
String s = "";
if (style == LONG) {
s = getCombiningName(unifiedPropMask);
if (s.length() != 0) return s;
s = "fixed_";
}
return s + ucd.getCombiningClassID_fromIndex((short)(0xFF & propMask));
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
return UCD_Names.LONG_BC[propMask];
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
return UCD_Names.SHORT_DT[propMask];
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
return UCD_Names.SHORT_NT[propMask];
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
return UCD_Names.SHORT_EA[propMask];
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
return UCD_Names.LONG_LB[propMask];
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
return UCD_Names.LONG_JOINING_TYPE[propMask];
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroupID_fromIndex(propMask);
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
return UCD_Names.SHORT_BP[propMask];
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
return UCD_Names.ABB_SCRIPT[propMask];
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAgeID_fromIndex(propMask);
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
}

View file

@ -0,0 +1,475 @@
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.text.*;
import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
public final class Normalizer implements UCD_Types {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
public static boolean SHOW_PROGRESS = false;
/**
* Create a normalizer for a given form.
* @param form one of NFD, NFKD, NFC, NFKC; only the COMPOSITION_MASK and
*        COMPATIBILITY_MASK bits are examined
* @param unicodeVersion version of the character data to use; the empty string
*        selects UCD.latestVersion (see getData)
*/
public Normalizer(byte form, String unicodeVersion) {
this.composition = (form & COMPOSITION_MASK) != 0;
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
this.data = getData(unicodeVersion);
}
/**
* Create a normalizer for a given form, using the latest Unicode version.
*/
public Normalizer(byte form) {
this(form,"");
}
/**
* Masks for the form selector
*/
public static final byte
COMPATIBILITY_MASK = 1,
COMPOSITION_MASK = 2;
/**
* Normalization Form Selector
*/
public static final byte
NFD = 0 ,
NFKD = COMPATIBILITY_MASK,
NFC = COMPOSITION_MASK,
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
/**
* Normalizes text according to the chosen form, adding the result to the
* target buffer.
* The target is NOT cleared first: decomposed characters are inserted at (or,
* for reordered combining marks, near) its end, and for composing forms the whole
* buffer is then composed in place. Pass an empty buffer to get just the
* normalized form of source. An empty source leaves the target untouched.
* @param source the original text, unnormalized
* @param target the buffer that receives the normalized text
* @return target
*/
public StringBuffer normalize(String source, StringBuffer target) {
// First decompose the source into target,
// then compose if the form requires.
if (source.length() != 0) {
internalDecompose(source, target);
if (composition) {
internalCompose(target);
}
}
return target;
}
/**
* Normalizes text according to the chosen form
* @param source the original text, unnormalized
* @return the resulting normalized text
*/
public String normalize(String source) {
return normalize(source, new StringBuffer()).toString();
}
/**
* Normalizes a single code point according to the chosen form
* @param cp the code point to normalize
* @return the resulting normalized text
*/
public String normalize(int cp) {
return normalize(UTF16.valueOf(cp));
}
// Scratch buffer reused by hasDecomposition(); makes that method non-reentrant.
private StringBuffer hasDecompositionBuffer = new StringBuffer();
/**
 * Tells whether normalizing a single code point with this normalizer changes it.
 * The comparison is made on the whole UTF-16 string. The earlier check
 * "length != 1" reported every supplementary code point as changed, because such
 * a code point occupies two chars even when normalization leaves it alone.
 * @param cp the code point to test
 * @return true if the normalized form differs from cp itself
 */
public boolean hasDecomposition(int cp) {
    hasDecompositionBuffer.setLength(0);
    String original = UTF16.valueOf(cp);
    normalize(original, hasDecompositionBuffer);
    return !original.equals(hasDecompositionBuffer.toString());
}
/**
* Does a quick check to see if the string is in the current form. Checks canonical order and
* isAllowed().
* @param source source text
* @return YES, NO, MAYBE
*/
/*
public static final int NO = 0, YES = 1, MAYBE = -1;
public int quickCheck(String source) {
short lastCanonicalClass = 0;
int result = YES;
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
short canonicalClass = data.getCanonicalClass(ch);
if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
return NO;
}
int check = isAllowed(ch);
if (check == NO) return NO;
if (check == MAYBE) result = MAYBE;
}
return result;
}
/**
* Find whether the given character is allowed in the current form.
* @return YES, NO, MAYBE
*/
/*
public int isAllowed(char ch) {
if (composition) {
if (compatibility) {
if (data.isCompatibilityExcluded(ch)) {
return NO;
}
} else {
if (data.isExcluded(ch)) {
return NO;
}
}
if (data.isTrailing(ch)) {
return MAYBE;
}
} else { // decomposition: both NFD and NFKD
if (data.normalizationDiffers(compatibility,ch)) return NO;
}
return YES;
}
/**
* Utility: Gets the combining class of a character from the
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
* return an int to forstall problems.
* @param ch the source character
* @return value from 0 to 255
*/
// Returns the canonical combining class of ch as a short, as reported by the data tables.
public short getCanonicalClass(char ch) {
return data.getCanonicalClass(ch);
}
/**
* Utility: Checks whether there is a recursive decomposition of a character from the
* Unicode Character Database. It is compatibility or canonical according to the particular
* normalizer; for composing forms, characters that recompose are not counted
* (see Stub.normalizationDiffers).
* @param ch the source character
*/
public boolean normalizationDiffers(int ch) {
return data.normalizationDiffers(ch, composition, compatibility);
}
/**
* Utility: Gets recursive decomposition of a character from the
* Unicode Character Database. This normalizer's compatibility setting decides
* between the recursive canonical decomposition and the recursive
* compatibility AND canonical decomposition.
* @param ch the source character
* @param buffer buffer to which the decomposition is appended
*/
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
data.getRecursiveDecomposition(ch, buffer, compatibility);
}
/**
* Utility: Gets composition mapping.
* @return IntEnumeration with the pair -> value mapping, where the
* pair is firstChar << 16 | secondChar.
* Will need to be fixed for surrogates.
*/
/*
public IntHashtable.IntEnumeration getComposition() {
return data.getComposition();
}
*/
// For composing forms, reports whether cp is flagged as a trailing character in the
// composition data; always false for decomposing forms.
public boolean isTrailing(int cp) {
return this.composition ? data.isTrailing(cp) : false;
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
private boolean composition;
private boolean compatibility;
/**
* Decomposes text, either canonical or compatibility according to the
* compatibility field, and adds the result to the target buffer in canonical
* order. The target is not cleared; existing trailing combining marks in it can
* take part in the reordering.
* @param source the original text, unnormalized
* @param target the buffer that receives the decomposed text
*/
private void internalDecompose(String source, StringBuffer target) {
StringBuffer buffer = new StringBuffer();
int ch32;
// walk the source one code point at a time
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compatibility);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
// bubble-sort combining marks as necessary
// move the insertion point back past every preceding character whose
// class is greater than chClass
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16Plus.charAt(target, k-1);
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
target.insert(k, UTF16.valueOf(ch));
}
}
}
/**
* Composes text in place. Target must already
* have been decomposed.
* Uses UTF16, which is a utility class for supplementary character support in Java.
* NOTE(review): reads the first character unconditionally, so target must be
* non-empty; normalize() only calls this for a non-empty source.
* @param target input: decomposed text.
* output: the resulting normalized text.
*/
private void internalCompose(StringBuffer target) {
int starterPos = 0;
int starterCh = UTF16Plus.charAt(target,0);
int compPos = UTF16.getCharCount(starterCh); // length of last composition
int lastClass = data.getCanonicalClass(starterCh);
if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
int oldLen = target.length();
// Loop on the decomposed characters, combining where possible
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(target, decompPos);
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos
+ ", ch: " + Utility.hex(ch)
);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
// combine only if a composite exists and ch is not blocked from the starter
if (composite != data.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16.setCharAt(target, starterPos, composite);
// we know that we will only be replacing non-supplementaries by non-supplementaries
// so we don't have to adjust the decompPos
starterCh = composite;
} else {
// ch stays: copy it down to the write position; a class-0 character becomes the new starter
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
UTF16.setCharAt(target, compPos, ch);
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
System.out.println("ADJUSTING: " + Utility.hex(target));
decompPos += target.length() - oldLen;
oldLen = target.length();
}
compPos += UTF16.getCharCount(ch);
}
}
// drop the leftover tail beyond the composed text
target.setLength(compPos);
}
static class Stub {
// character database for one Unicode version
private UCD ucd;
// (first << 32 | second) as Long -> composite code point as Integer
private HashMap compTable = new HashMap();
// code points that can be the second half of a pair, plus trailing jamo
private BitSet isSecond = new BitSet();
// composites that have a canonical recomposition
private BitSet canonicalRecompose = new BitSet();
// subset of the above whose two components have no compatibility decomposition
private BitSet compatibilityRecompose = new BitSet();
// "no composite" marker returned by getPairwiseComposition
static final int NOT_COMPOSITE = 0xFFFF;
/**
 * Builds the composition tables for one Unicode version.
 * Every assigned, non-private-use code point with a canonical decomposition that
 * is not a composition exclusion, maps to exactly two code points, and starts
 * with a class-0 character is entered into compTable; the related bit sets are
 * filled alongside.
 * @param version Unicode version passed to UCD.make
 * @throws ChainException wrapping any failure, with the offending code point
 */
Stub(String version) {
    ucd = UCD.make(version);
    // Inclusive bound: the original "< 0x10FFFF" never examined the last code point.
    for (int i = 0; i <= 0x10FFFF; ++i) {
        if (!ucd.isAssigned(i)) continue;
        if (ucd.isPUA(i)) continue;
        if (ucd.isTrailingJamo(i)) isSecond.set(i);
        byte dt = ucd.getDecompositionType(i);
        if (dt != CANONICAL) continue;
        if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
            try {
                String s = ucd.getDecompositionMapping(i);
                int len = UTF16.countCodePoint(s);
                if (len != 2) {
                    if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                    continue;
                }
                int a = UTF16.charAt(s, 0);
                if (ucd.getCombiningClass(a) != 0) continue;
                int b = UTF16.charAt(s, UTF16.getCharCount(a));
                isSecond.set(b);
                // have a recomposition, so set the bit
                canonicalRecompose.set(i);
                // set the compatibility recomposition bit
                // ONLY if the component characters
                // don't compatibility decompose
                if (ucd.getDecompositionType(a) <= CANONICAL
                    && ucd.getDecompositionType(b) <= CANONICAL) {
                    compatibilityRecompose.set(i);
                }
                long key = (((long)a)<<32) | b;
                /*if (i == '\u1E0A' || key == 0x004400000307) {
                System.out.println(Utility.hex(s));
                System.out.println(Utility.hex(i));
                System.out.println(Utility.hex(key));
                }*/
                compTable.put(new Long(key), new Integer(i));
            } catch (Exception e) {
                throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
            }
        }
    }
    // process compatibilityRecompose
    // have to do this afterwards, since we don't know whether the pieces
    // are allowable until we have processed all the characters
    /*
    Iterator it = compTable.keySet().iterator();
    while (it.hasNext()) {
    Long key = (Long)it.next();
    int cp = compTable.get(key);
    long keyLong = key.longValue();
    int first = (int)(keyLong >>> 32);
    int second = (int)keyLong;
    if (ucd.
    */
}
/*
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
*/
// Returns the combining class of cp from the character database.
short getCanonicalClass(int cp) {
return ucd.getCombiningClass(cp);
}
// True if cp was recorded in isSecond by the constructor.
boolean isTrailing(int cp) {
return isSecond.get(cp);
}
/**
* Tells, from the decomposition type alone, whether cp changes under the given form.
* Decomposing forms: true for a canonical decomposition, and also for any type at
* or above CANONICAL when compatibility is set.
* Composing forms: the same, minus the characters recorded as recomposing.
*/
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
byte dt = ucd.getDecompositionType(cp);
if (!composition) {
if (compatibility) return dt >= CANONICAL;
else return dt == CANONICAL;
} else {
// almost the same, except that we add back in the characters
// that RECOMPOSE
if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
else return dt == CANONICAL && !canonicalRecompose.get(cp);
}
}
/**
* Appends the full (recursive) decomposition of cp to buffer, or cp itself when
* it does not decompose under the requested mode.
* @param cp code point to decompose
* @param buffer receives the result; existing contents are kept
* @param compatibility if true, types above CANONICAL are decomposed as well
*/
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
byte dt = ucd.getDecompositionType(cp);
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
if (dt == CANONICAL || dt > CANONICAL && compatibility) {
String s = ucd.getDecompositionMapping(cp);
// the parameter cp is reused as the loop's current code point
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
getRecursiveDecomposition(cp, buffer, compatibility);
}
} else {
UTF16.append(buffer, cp);
}
}
/**
 * Looks up the composite for a pair of code points.
 * Hangul composition is tried first; otherwise the pair is looked up in compTable.
 * @param starterCh first code point of the pair
 * @param ch second code point of the pair
 * @return the composite, or 0xFFFF when the pair does not compose
 */
int getPairwiseComposition(int starterCh, int ch) {
    int hangul = UCD.composeHangul(starterCh, ch);
    if (hangul != 0xFFFF) {
        return hangul;
    }
    long key = (((long)starterCh) << 32) | ch;
    Integer composite = (Integer)compTable.get(new Long(key));
    return composite == null ? 0xFFFF : composite.intValue();
}
}
/**
* Contains normalization data from the Unicode Character Database for the
* version this normalizer was created with.
*/
private Stub data;
// Stub instances built so far, keyed by version string.
// NOTE(review): plain HashMap with no synchronization -- confirm single-threaded use.
private static HashMap versionCache = new HashMap();
/**
* Returns the data for a Unicode version, building and caching it on first use.
* @param version version string; the empty string selects UCD.latestVersion
*/
private static Stub getData (String version) {
if (version.length() == 0) version = UCD.latestVersion;
Stub result = (Stub)versionCache.get(version);
if (result == null) {
result = new Stub(version);
versionCache.put(version, result);
}
return result;
}
/**
* Just accessible for testing.
*/
/*
boolean isExcluded (char ch) {
return data.isExcluded(ch);
}
/**
* Just accessible for testing.
*/
/*
String getRawDecompositionMapping (char ch) {
return data.getRawDecompositionMapping(ch);
}
//*/
}

View file

@ -0,0 +1,203 @@
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
abstract public class PropertyLister implements UCD_Types {
// when true, format() factors out the common leading words of a range's two names
static final boolean COMPRESS_NAMES = false;
// when true, getKenName() suppresses names that begin with '<'
static final boolean DROP_INDICATORS = true;
// character database queried by the lister
protected UCD ucdData;
// stream that receives the listing
protected PrintStream output;
// when true, format() echoes each line to System.out
protected boolean showOnConsole;
// when false, optionalComment() returns the empty string
protected boolean usePropertyComment = true;
protected int firstRealCp = -2;
protected int lastRealCp = -2;
protected boolean alwaysBreaks = false; // set to true if property only breaks
// values returned by status()
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
/**
* Classifies a code point for the listing.
* @return status, one of INCLUDE, BREAK, CONTINUE, EXCLUDE. Also have access to firstRealCp, lastRealCp
*/
abstract public byte status(int cp);
/** Header printed once before the listing by print(); empty by default. */
public String headerString() {
return "";
}
/** Text for the property column of a line; empty by default. */
public String propertyName(int cp) {
return "";
}
/** Text for an extra column after the property; empty by default. */
public String optionalName(int cp) {
return "";
}
/**
* Text placed at the start of the trailing comment: the general-category ID of
* cp, with Lt, Ll and Lu all reported as "L&". Empty when usePropertyComment is
* false.
*/
public String optionalComment(int cp) {
if (!usePropertyComment) return "";
int cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
return ucdData.getCategoryID(cp);
}
/** Minimum combined width of the property and optional columns, used for padding. */
public int minPropertyWidth() {
return 1;
}
/**
* Writes one line of the listing for a single code point or a range.
* The line holds the hex code point(s), the property and optional columns taken
* from startCp, and a trailing comment with the optional comment, a count for
* ranges, and the character name(s).
* @param startCp first code point of the range
* @param endCp last code point of the range; equal to startCp for a single line
* @param realCount number of listed code points in the range; when it is smaller
*        than the span, the difference is shown after a slash
* @throws ChainException wrapping any failure, with both code points
*/
public void format(int startCp, int endCp, int realCount) {
try {
String prop = propertyName(startCp);
if (prop.length() > 0) prop = "; " + prop;
String opt = optionalName(startCp);
if (opt.length() > 0) opt = "; " + opt;
String optCom = optionalComment(startCp);
if (optCom.length() > 0) optCom += " ";
String startName = getKenName(startCp);
String line;
// NOTE(review): the repeat counts here and below can go negative for long
// values -- confirm Utility.repeat tolerates that.
String pgap = Utility.repeat(" ", minPropertyWidth() - prop.length() - opt.length());
if (startCp != endCp) {
String endName = getKenName(endCp);
// code points inside the span that are not counted as listed
int bridge = endCp - startCp + 1 - realCount;
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
+ prop + opt + pgap + " # " + optCom
+ countStr;
if (startName.length() != 0 || endName.length() != 0) {
int com = 0;
if (COMPRESS_NAMES) com = commonInitialWords(startName, endName);
if (com == 0) {
line += startName + ".." + endName;
} else {
line += startName.substring(0,com)
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
}
}
} else {
String gap = alwaysBreaks
? Utility.repeat(" ", 6 - width(startCp))
: Utility.repeat(" ", 14 - width(startCp));
String gap2 = alwaysBreaks
? " "
: " ";
line = Utility.hex(startCp,4) + gap
+ prop + opt + pgap + " # " + optCom + gap2
+ startName;
}
output.println(line);
if (showOnConsole) System.out.println(line);
} catch (Exception e) {
throw new ChainException("Format error {0}, {1}",
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
}
}
/**
 * Number of hex digits (4, 5 or 6) counted for a code point when format()
 * computes column gaps.
 */
int width(int cp) {
    if (cp <= 0xFFFF) {
        return 4;
    }
    if (cp <= 0xFFFFF) {
        return 5;
    }
    return 6;
}
/**
 * Name of the code point as shown in the trailing comment of a listing line.
 *
 * @param cp code point to look up in ucdData
 * @return "" when the name is null or empty; when DROP_INDICATORS is set and
 *         the name starts with '<', "<control>" for cp below 0xFF and ""
 *         otherwise; in all other cases the name unchanged
 */
String getKenName(int cp) {
    String result = ucdData.getName(cp);
    // An empty name would make charAt(0) throw, so treat it like a missing name.
    if (result == null || result.length() == 0) return "";
    if (DROP_INDICATORS && result.charAt(0) == '<') {
        if (cp < 0xFF) return "<control>";
        return "";
    }
    return result;
}
/**
 * Finds the length of the longest common prefix of the two strings that ends
 * on a word boundary, i.e. just after a SPACE or HYPHEN-MINUS, or at the end of
 * the shorter string when the longer one ends or continues with SPACE or
 * HYPHEN-MINUS at that position.
 *
 * @return length of that prefix, or 0 if there is none
 */
public static int commonInitialWords(String a, String b) {
    final boolean aIsShorter = a.length() <= b.length();
    final String shorter = aIsShorter ? a : b;
    final String longer = aIsShorter ? b : a;
    final int limit = shorter.length();

    int boundary = 0;
    int index = 0;
    while (index < limit) {
        final char current = shorter.charAt(index);
        if (current != longer.charAt(index)) {
            return boundary;
        }
        ++index;
        if (current == ' ' || current == '-') {
            boundary = index;
        }
    }

    // The whole shorter string matched; it counts if the longer one breaks there too.
    final boolean endsOnBoundary = longer.length() == limit
        || longer.charAt(limit) == ' '
        || longer.charAt(limit) == '-';
    return endsOnBoundary ? limit : boundary;
}
/**
 * Walks every code point from 0 to 0x10FFFF, asks status(cp) how to treat it,
 * groups code points into ranges and writes each finished range with format().
 * Writes headerString() first (if non-empty) and a "# Total code points" trailer last.
 *
 * @return the number of code points whose status was INCLUDE or BREAK
 */
public int print() {
int count = 0;
firstRealCp = -1; // -1 means no range is currently open
byte firstRealCpCat = -1;
lastRealCp = -1;
int realRangeCount = 0;
String header = headerString();
if (header.length() != 0) {
output.println(header);
output.println();
}
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
byte s = status(cp);
// An INCLUDE whose general category differs from the category of the open
// range's first code point (with Lt and Ll folded into Lu) is handled as a BREAK.
if (s == INCLUDE && firstRealCp != -1) {
byte cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll) cat = Lu;
if (cat != firstRealCpCat) s = BREAK;
}
switch(s) {
case CONTINUE:
break; // do nothing
case INCLUDE:
// Extend the open range, or open a new one starting at cp.
if (firstRealCp == -1) {
firstRealCp = cp;
firstRealCpCat = ucdData.getCategory(firstRealCp);
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
}
lastRealCp = cp;
count++;
realRangeCount++;
break;
case BREAK:
// Flush the open range (if any) and start a new range at cp.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
}
lastRealCp = firstRealCp = cp;
firstRealCpCat = ucdData.getCategory(firstRealCp);
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
realRangeCount = 1;
count++;
break;
case EXCLUDE:
// Flush and close the open range; cp itself is not listed.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
firstRealCp = -1;
realRangeCount = 0;
}
break;
}
}
// Flush a range that is still open after the last code point.
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
}
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
output.println();
output.println("# Total code points: " + count);
output.println();
return count;
}
}

View file

@ -0,0 +1,473 @@
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
public class TestData implements UCD_Types {
/**
 * Driver: loads the default UCD, prints its version and date, and runs two
 * checkHoffman() samples. The file-generation calls sit in an "if (false)" block
 * and are therefore not executed.
 */
public static void main (String[] args) throws IOException {
System.out.println("START");
ucd = UCD.make();
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
int mask = 0;
// Disabled: generation of the derived data files.
if (false) {
generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedBidiClass-3.1.1d1.txt");
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-3.1.0d1.txt");
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedEastAsianWidth-3.1.0d1.txt");
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedGeneralCategory-3.1.0d1.txt");
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedCombiningClass-3.1.0d1.txt");
generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedDecompositionType-3.1.0d1.txt");
generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedNumericType-3.1.0d1.txt");
// NOTE(review): DerivedEastAsianWidth is generated a second time here; looks redundant -- confirm.
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedEastAsianWidth-3.1.0d1.txt");
generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedJoiningType-3.1.0d1.txt");
generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedJoiningGroup-3.1.0d1.txt");
generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedBinaryProperties-3.1.0d1.txt");
// An empty enum range with endEnum == LIMIT_ENUM makes generateVerticalSlice emit only the numeric-values section.
generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedNumericValues-3.1.0d1.txt");
mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-3.1.0d1.txt");
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedLineBreak-3.1.0d1.txt");
generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-3.1.0d4.txt");
generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
KEEP_SPECIAL, HEADER_EXTEND, "PropList-3.1.0d5.txt");
writeNormalizerTestSuite("NormalizationTest-3.1.0d1.txt");
}
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
// HEADER_DERIVED, "DerivedPropData2-3.1.0d1.txt");
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-3.1.0d1.txt");
//listStrings("LowerCase-3.1.0d1.txt", 0,0);
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-3.1.0d1.txt");
// AGE stuff
//UCD ucd = UCD.make();
//System.out.println(ucd.getAgeID(0x61));
//System.out.println(ucd.getAgeID(0x2FA1D));
//generateCompExclusions();
System.out.println("END");
}
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
/**
 * Prints the NFKC form of a sample string next to the original (both as hex),
 * then a per-code-point breakdown of the original and of the normalized text.
 */
public static void checkHoffman(String test) {
    final String normalized = nfkc.normalize(test);
    final String summary = Utility.hex(test) + " => " + Utility.hex(normalized);
    System.out.println(summary);

    System.out.println();
    show(test, 0);

    System.out.println();
    show(normalized, 0);
}
/**
 * Prints one line per code point of s: code, right-aligned combining class and
 * name, indented by the given number of spaces. When the NFKC form of a code
 * point differs from the code point itself, that form is listed recursively
 * with four more spaces of indentation.
 */
public static void show(String s, int indent) {
    int offset = 0;
    while (offset < s.length()) {
        final int codePoint = UTF32.char32At(s, offset);

        String combiningClass = " " + ucd.getCombiningClass(codePoint);
        combiningClass = Utility.repeat(" ", 4 - combiningClass.length()) + combiningClass;
        System.out.println(Utility.repeat(" ", indent) + ucd.getCode(codePoint)
            + combiningClass + " " + ucd.getName(codePoint));

        final String normalized = nfkc.normalize(codePoint);
        final boolean unchanged = normalized.equals(UTF32.valueOf32(codePoint));
        if (!unchanged) {
            show(normalized, indent + 4);
        }

        // Step over one or two UTF-16 units, depending on the code point.
        offset += UTF32.count16(codePoint);
    }
}
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
static {
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
/**
 * Removes a one-digit draft suffix ("d" plus a digit) that directly precedes
 * the ".txt" extension, e.g. "DerivedJoiningGroup-3.1.0d1.txt" becomes
 * "DerivedJoiningGroup-3.1.0.txt". Prints "Fixing File Name" when it does so.
 *
 * @param s file name
 * @return the name without the draft suffix, or s unchanged when it does not
 *         end in ".txt", is too short to hold a suffix, or has no such suffix
 */
public static String fixFile(String s) {
    int len = s.length();
    // "dN.txt" needs six characters; shorter names would index before the start.
    if (len < 6 || !s.endsWith(".txt")) return s;
    if (s.charAt(len-6) != 'd') return s;
    char c = s.charAt(len-5);
    if (c < '0' || '9' < c) return s;
    System.out.println("Fixing File Name");
    return s.substring(0,len-6) + s.substring(len-4);
}
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
/**
 * Writes the comment header of a generated data file: the file name (with any
 * draft suffix removed by fixFile), a description chosen by headerChoice, the
 * current date, a note about omitted code points, a separator and a blank line.
 *
 * @param fileName name of the file being generated
 * @param output stream the header is written to
 * @param headerChoice HEADER_SCRIPTS or HEADER_EXTEND; any other value selects
 *        the derived-property text
 */
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
    output.println("# " + fixFile(fileName));
    output.println("#");

    switch (headerChoice) {
    case HEADER_SCRIPTS:
        output.println("# For documentation, see UTR #24: Script Names");
        output.println("# http://www.unicode.org/unicode/reports/tr24/");
        break;
    case HEADER_EXTEND:
        output.println("# Unicode Character Database: Extended Properties");
        output.println("# For documentation, see PropList.html");
        break;
    default:
        output.println("# Unicode Character Database: Derived Property Data");
        output.println("# Generated algorithmically from the Unicode Character Database");
        output.println("# For documentation, see DerivedProperties.html");
        break;
    }

    final String timestamp = myDateFormat.format(new Date());
    output.println("# Date: " + timestamp + " [MD]");
    output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
    output.println("# except when listing Noncharacter or Cn.");
    output.println("# ================================================");
    output.println();
}
/**
 * Writes one derived-property file: a header followed by one
 * DerivedPropertyLister section for every bit set in bitMask.
 *
 * @param bitMask bit i selects derived property i; bits at or above
 *        DerivedPropertyLister.LIMIT are ignored
 * @param headerChoice header variant passed to doHeader
 * @param fileName file created under GEN_DIR
 * @throws IOException if the output file cannot be opened
 */
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
    ucd = UCD.make("310");
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
    try {
        doHeader(fileName, output, headerChoice);
        for (int i = 0; i < 32; ++i) {
            if ((bitMask & (1<<i)) == 0) continue;
            if (i >= DerivedPropertyLister.LIMIT) break;
            System.out.print('.');
            output.println("# ================================================");
            output.println();
            new DerivedPropertyLister(ucd, i, output).print();
        }
    } finally {
        // Close the file even when a lister throws, so the handle is not leaked.
        output.close();
    }
}
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
if (!ucd.isRepresented(i)) continue;
if (ucd30.isRepresented(i)) continue;
String string = "";
switch(type) {
case 0: string = ucd.getSimpleLowercase(i);
}
if (UTF32.length32(string) == 1 && UTF32.char32At(string,0) == i) continue;
output.println(Utility.hex(i) + "; C; " + Utility.hex(string) + "; # " + ucd.getName(i));
}
output.close();
}
*/
/**
 * Writes the CompLister listing to "CompositionExclusionsDelta.txt" under GEN_DIR.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void generateCompExclusions() throws IOException {
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
    try {
        new CompLister(output).print();
    } finally {
        // Close the file even when the lister throws.
        output.close();
    }
}
/**
 * Lists the code points whose decomposition type is CANONICAL in UCD "310" but
 * not in UCD "300". The property column shows the length, in code points, of
 * the decomposition mapping; a change in that length starts a new range.
 */
static class CompLister extends PropertyLister {
    UCD oldUCD;
    // Mapping length of the most recently listed code point.
    int oldLength = 0;

    public CompLister(PrintStream output) {
        this.output = output;
        ucdData = UCD.make("310");
        oldUCD = UCD.make("300");
        showOnConsole = true;
    }

    public String propertyName(int cp) {
        return String.valueOf(UTF32.length32(ucdData.getDecompositionMapping(cp)));
    }

    public byte status(int cp) {
        final boolean newlyCanonical = ucdData.getDecompositionType(cp) == CANONICAL
            && oldUCD.getDecompositionType(cp) != CANONICAL;
        if (!newlyCanonical) {
            return EXCLUDE;
        }
        final int previousLength = oldLength;
        oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
        return previousLength != oldLength ? BREAK : INCLUDE;
    }
}
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
/**
 * Writes one data file listing, for each unified property value in
 * [startEnum, endEnum), the code points that have it (via MyPropertyLister).
 * When endEnum == LIMIT_ENUM a "Numeric Values" section is appended, with one
 * MyFloatLister listing per distinct numeric value.
 *
 * @param startEnum first unified property value (inclusive)
 * @param endEnum last unified property value (exclusive)
 * @param skipSpecial SKIP_SPECIAL omits the values from
 *        BINARY_PROPERTIES|CompositionExclusion up to AGE+NEXT_ENUM
 * @param headerChoice header variant passed to doHeader
 * @param file file created under GEN_DIR
 * @throws IOException if the output file cannot be opened
 */
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial, int headerChoice, String file) throws IOException {
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
    try {
        doHeader(file, output, headerChoice);
        int last = -1;
        for (int i = startEnum; i < endEnum; ++i) {
            if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
            if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
                || i == (CATEGORY | UNUSED_CATEGORY)
                || i == (BINARY_PROPERTIES | Non_break)
                || i == (JOINING_TYPE | JT_U)
                || i == (SCRIPT | UNUSED_SCRIPT)
                || i == (JOINING_GROUP | NO_SHAPING)
                ) continue; // skip zero case
            if (skipSpecial == SKIP_SPECIAL
                && i >= (BINARY_PROPERTIES | CompositionExclusion)
                && i < (AGE + NEXT_ENUM)) continue;
            // The high byte of i selects the property; print a titled banner when it changes.
            if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
                output.println();
                output.println("# ================================================");
                output.println("# " + UCD_Names.UNIFIED_PROPERTIES[i>>8]);
                output.println("# ================================================");
                output.println();
                System.out.println();
                System.out.println(UCD_Names.UNIFIED_PROPERTIES[i>>8]);
                last = i;
            } else {
                output.println("# ================================================");
                output.println();
            }
            System.out.print(".");
            new MyPropertyLister(ucd, i, output).print();
        }
        if (endEnum == LIMIT_ENUM) {
            output.println();
            output.println("# ================================================");
            output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
            output.println("# ================================================");
            output.println();
            System.out.println();
            System.out.println("@NUMERIC VALUES");
            // Collect the distinct numeric values in ascending order.
            Set floatSet = new TreeSet();
            for (int i = 0; i < 0x10FFFF; ++i) {
                float nv = ucd.getNumericValue(i);
                if (Float.isNaN(nv)) continue;
                floatSet.add(new Float(nv));
            }
            Iterator it = floatSet.iterator();
            while(it.hasNext()) {
                new MyFloatLister(ucd, ((Float)it.next()).floatValue(), output).print();
                output.println();
                System.out.print(".");
            }
        }
    } finally {
        // Close the file even when a lister throws, so the handle is not leaked.
        output.close();
    }
    System.out.println();
}
static UCD ucd;
static public Normalizer formC, formD, formKC, formKD;
/**
 * Generates the normalization conformance test file (UTF-8) under GEN_DIR.
 * It contains a comment header describing the invariants, the hand-picked
 * cases from testSuiteCases, one line for every assigned non-PUA code point
 * that is changed by at least one normalization form, and a canonical
 * ordering test built from sample characters of neighbouring combining classes.
 *
 * @param fileName file created under GEN_DIR
 * @throws IOException if the output file cannot be opened
 */
static public void writeNormalizerTestSuite(String fileName) throws IOException {
    PrintWriter log = new PrintWriter(
        new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream(GEN_DIR + fileName),
                "UTF8"),
            32*1024));
    try {
        formC = new Normalizer(Normalizer.NFC);
        formD = new Normalizer(Normalizer.NFD);
        formKC = new Normalizer(Normalizer.NFKC);
        formKD = new Normalizer(Normalizer.NFKD);
        log.println("# " + fixFile(fileName));
        log.println("#");
        log.println("# Normalization Test Suite");
        log.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
        log.println("# Format:");
        log.println("#");
        log.println("# Columns (c1, c2,...) are separated by semicolons");
        log.println("# Comments are indicated with hash marks");
        log.println("#");
        log.println("# CONFORMANCE:");
        log.println("# 1. The following invariants must be true for all conformant implementations");
        log.println("#");
        log.println("# NFC");
        log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
        log.println("# c4 == NFC(c4) == NFC(c5)");
        log.println("#");
        log.println("# NFD");
        log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
        // Closing parenthesis was missing from this emitted line.
        log.println("# c5 == NFD(c4) == NFD(c5)");
        log.println("#");
        log.println("# NFKC");
        log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
        log.println("#");
        log.println("# NFKD");
        log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
        log.println("#");
        log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
        log.println("# listed in Part 1, the following invariants must be true for all conformant");
        log.println("# implementations:");
        log.println("#");
        log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");

        System.out.println("Writing Part 1");
        log.println("#");
        log.println("@Part0 # Specific cases");
        log.println("#");
        for (int j = 0; j < testSuiteCases.length; ++j) {
            writeLine(testSuiteCases[j], log, false);
        }

        System.out.println("Writing Part 2");
        log.println("#");
        log.println("@Part1 # Character by character test");
        log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
        log.println("#");
        for (int ch = 0; ch < 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            String cc = UTF32.valueOf32(ch);
            // check == true: characters unchanged by all four forms are not written.
            writeLine(cc,log, true);
        }
        Utility.fixDot();

        System.out.println("Finding Examples");
        // example[c] = first assigned non-PUA character with combining class c.
        String[] example = new String[256];
        for (int ch = 0; ch < 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            int cc = ucd.getCombiningClass(ch);
            if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
        }
        Utility.fixDot();

        System.out.println("Writing Part 3");
        log.println("#");
        log.println("@Part2 # Canonical Order Test");
        log.println("#");
        for (int ch = 0; ch < 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!ucd.isAssigned(ch)) continue;
            if (ucd.isPUA(ch)) continue;
            short c = ucd.getCombiningClass(ch);
            if (c == 0) continue;
            // add character with higher class, same class, lower class
            String sample = "";
            for (int i = c+1; i < example.length; ++i) {
                if (example[i] == null) continue;
                sample += example[i];
                break;
            }
            sample += example[c];
            for (int i = c-1; i > 0; --i) {
                if (example[i] == null) continue;
                sample += example[i];
                break;
            }
            writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
            writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
        }
        Utility.fixDot();
        log.println("#");
        log.println("# END OF FILE");
    } finally {
        // Flush and close the file even when generation fails part-way.
        log.close();
    }
}
/**
 * Writes one test line: the source string and its NFC, NFD, NFKC and NFKD
 * forms as hex columns, followed by a comment that shows the same five strings
 * as text and the name(s) of the source.
 *
 * @param cc source string
 * @param log writer receiving the line
 * @param check when true, nothing is written if all four forms equal cc
 */
static void writeLine(String cc, PrintWriter log, boolean check) {
    String c = formC.normalize(cc);
    String d = formD.normalize(cc);
    String kc = formKC.normalize(cc);
    String kd = formKD.normalize(cc);
    // Short-circuit '&&' throughout; the original mixed in a bitwise '&'.
    if (check && cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd)) return;
    log.println(
        Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
        + Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
        + "; # ("
        + comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
        + ") " + ucd.getName(cc));
}
static StringBuffer commaResult = new StringBuffer();
/**
 * Returns a copy of s in which every code point of general category Mn is
 * preceded by U+25CC DOTTED CIRCLE.
 *
 * Not reentrant: the result is built in the shared static buffer commaResult.
 */
static final String comma(String s) {
    commaResult.setLength(0);
    int cp;
    // Advance by the UTF-16 length of the code point just read. The original
    // passed the index i to count16, so a supplementary character advanced
    // only one unit and its low surrogate was processed again.
    for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
        cp = UTF32.char32At(s, i);
        if (ucd.getCategory(cp) == Mn) commaResult.append('\u25CC');
        UTF32.append32(commaResult, cp);
    }
    return commaResult.toString();
}
static final String[] testSuiteCases = {
"\u1E0A",
"\u1E0C",
"\u1E0A\u0323",
"\u1E0C\u0307",
"D\u0307\u0323",
"D\u0323\u0307",
"\u1E0A\u031B",
"\u1E0C\u031B",
"\u1E0A\u031B\u0323",
"\u1E0C\u031B\u0307",
"D\u031B\u0307\u0323",
"D\u031B\u0323\u0307",
"\u00C8",
"\u0112",
"E\u0300",
"E\u0304",
"\u1E14",
"\u0112\u0300",
"\u1E14\u0304",
"E\u0304\u0300",
"E\u0300\u0304",
};
}

View file

@ -0,0 +1,185 @@
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
public final class TestNormalization {
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
static final boolean SKIP_FILE = true;
static PrintWriter out = null;
static BufferedReader in = null;
static Normalizer nfc;
static Normalizer nfd;
static Normalizer nfkc;
static Normalizer nfkd;
static UCD ucd;
static BitSet charsListed = new BitSet(0x110000);
static int errorCount = 0;
static int lineErrorCount = 0;
static String originalLine = "";
static String lastLine = "";
/**
 * Runs the normalizers against NormalizationTest.txt (read from DIR) and logs
 * every violated invariant to NormalizationTestLog.txt, then checks that every
 * code point not listed in column 1 of the file is unchanged by all four forms.
 *
 * Lines starting with '@' (part markers) are ignored; data lines with fewer
 * than five columns are reported in the log and skipped rather than being
 * checked against columns left over from a previous line.
 *
 * @throws java.io.IOException if the test file or the log cannot be opened or read
 */
public static void main(String[] args) throws java.io.IOException {
    final int requiredColumns = 5;

    System.out.println("Creating Normalizers");
    ucd = UCD.make("");
    nfc = new Normalizer(Normalizer.NFC);
    nfd = new Normalizer(Normalizer.NFD);
    nfkc = new Normalizer(Normalizer.NFKC);
    nfkd = new Normalizer(Normalizer.NFKD);

    try {
        // Open the log before any check() call: check() writes mismatches to 'out'.
        out = new PrintWriter(
            new BufferedWriter(
                new OutputStreamWriter(
                    new FileOutputStream("NormalizationTestLog.txt"),
                    "UTF8"),
                32*1024));

        // Smoke test with a supplementary code point.
        String x = UTF32.valueOf32(0x10000);
        check("NFC", nfc, x);
        check("NFD", nfd, x);
        check("NFKC", nfkc, x);
        check("NFKD", nfkd, x);

        // Read as UTF-8 (the log is written as UTF-8) instead of the platform default.
        in = new BufferedReader(
            new InputStreamReader(
                new FileInputStream(DIR + "NormalizationTest.txt"),
                "UTF8"),
            32*1024);

        String[] parts = new String[10];
        System.out.println("Checking files");
        int count = 0;
        while (true) {
            String line = in.readLine();
            if (line == null) break;
            if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
            originalLine = line;
            int pos = line.indexOf('#');
            if (pos >= 0) {
                line = line.substring(0,pos);
            }
            line = line.trim();
            if (line.length() == 0) continue;
            if (line.charAt(0) == '@') continue; // part marker, not test data
            int splitCount = Utility.split(line, ';', parts);
            if (splitCount < requiredColumns) {
                out.println("SKIPPED (fewer than " + requiredColumns + " columns): " + originalLine);
                continue;
            }
            for (int i = 0; i < splitCount; ++i) {
                parts[i] = Utility.fromHex(parts[i]);
            }
            // Remember single code points listed in column 1 for checkMissing().
            if (UTF32.length32(parts[0]) == 1) {
                int code = UTF32.char32At(parts[0],0);
                charsListed.set(code);
                if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
            }
            // c2 == NFC(c1) == NFC(c2) == NFC(c3)
            errorCount += check("NFCa", nfc, parts[1], parts[0]);
            errorCount += check("NFCb", nfc, parts[1], parts[1]);
            errorCount += check("NFCc", nfc, parts[1], parts[2]);
            // c4 == NFC(c4) == NFC(c5)
            errorCount += check("NFCd", nfc, parts[3], parts[3]);
            errorCount += check("NFCe", nfc, parts[3], parts[4]);
            // c3 == NFD(c1) == NFD(c2) == NFD(c3)
            errorCount += check("NFDa", nfd, parts[2], parts[0]);
            errorCount += check("NFDb", nfd, parts[2], parts[1]);
            errorCount += check("NFDc", nfd, parts[2], parts[2]);
            // c5 == NFD(c4) == NFD(c5)
            errorCount += check("NFDd", nfd, parts[4], parts[3]);
            errorCount += check("NFDe", nfd, parts[4], parts[4]);
            // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
            errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
            errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
            errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
            errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
            errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
            // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
            errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
            errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
            errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
            errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
            errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
        }
        System.out.println("Total errors in file: " + errorCount
            + ", lines: " + lineErrorCount);
        errorCount = lineErrorCount = 0;
        System.out.println("Checking Missing");
        checkMissing();
        System.out.println("Total errors in unlisted items: " + errorCount
            + ", lines: " + lineErrorCount);
    } finally {
        // Close both streams even if closing the reader fails.
        try {
            if (in != null) in.close();
        } finally {
            if (out != null) out.close();
        }
    }
}
static String lastBase = "";
/**
 * Verifies that normalizing other with n yields base. A mismatch is written to
 * the log as a "DIFF" line; the first mismatch for a given input line also
 * echoes that line, and lineErrorCount is incremented whenever base differs
 * from the base of the previous mismatch.
 *
 * @param type label of the invariant, used in the log line
 * @param n normalizer under test
 * @param base expected result
 * @param other string to normalize
 * @return 1 if a mismatch was logged, 0 otherwise
 * @throws ChainException wrapping any exception raised during the check
 */
public static int check(String type, Normalizer n, String base, String other) {
    try {
        final String actual = n.normalize(other);
        if (actual.equals(base)) {
            return 0;
        }

        // Echo the source line only once per line.
        String lineEcho = "";
        if (!lastLine.equals(originalLine)) {
            lineEcho = "// " + originalLine;
            lastLine = originalLine;
        }

        if (!base.equals(lastBase)) {
            lastBase = base;
            lineErrorCount++;
        }

        final String argument = base.equals(other)
            ? ""
            : "(" + ucd.getCodeAndName(other) + ")";

        final StringBuffer message = new StringBuffer();
        message.append("DIFF ").append(type).append(": ");
        message.append(ucd.getCodeAndName(base)).append(" != ");
        message.append(type).append(argument);
        message.append(" == ").append(ucd.getCodeAndName(actual));
        message.append(lineEcho);
        out.println(message.toString());
        return 1;
    } catch (Exception e) {
        throw new ChainException("DIFF " + type + ": "
            + ucd.getCodeAndName(base) + " != "
            + type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
    }
}
/**
 * Checks that base is unchanged by n, i.e. calls check(type, n, base, base).
 *
 * @return the result of the four-argument check
 */
public static int check(String type, Normalizer n, String base) {
return check(type, n, base, base);
}
/**
 * Checks every code point that was not listed in column 1 of the test file:
 * each must be unchanged by NFC, NFD, NFKC and NFKD. Mismatches are added to
 * errorCount.
 */
static void checkMissing() {
    // Cover the whole code space 0..0x10FFFF (charsListed is sized 0x110000);
    // the original bound of 0x100000 skipped plane 16.
    for (int missing = 0; missing <= 0x10FFFF; ++missing) {
        if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
        if (charsListed.get(missing)) continue;
        String x = UTF32.valueOf32(missing);
        errorCount += check("NFC", nfc, x);
        errorCount += check("NFD", nfd, x);
        errorCount += check("NFKC", nfkc, x);
        errorCount += check("NFKD", nfkd, x);
    }
}
}

View file

@ -0,0 +1,226 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Unicode Character Database</title>
<style>
<!--
table { padding: 4 }
td { padding: 4 }
-->
</style>
</head>
<body>
<span class="cb" id style="DISPLAY: block">
<h1 align="center">Unicode Character Database (UCD) in XML Format</h1>
<h1 align="center"><b><font color="#FF0000">WARNING: FORMAT IS DRAFT!</font></b></h1>
<p align="center">MD 2000.10.16</p>
<table border="1" width="40%" align="right" cellspacing="4" cellpadding="0">
<tr>
<td width="100%" bgcolor="#C0C0C0"><span class="cb" id
style="DISPLAY: block">
<h4 align="center">Using Internet Explorer</h4>
<p>The UCD-Main.xml file can be read in Internet Explorer (5.0 and above).
However:</p>
<ul>
<li>It may take a few minutes to load completely.</li>
<li>The XML parser in IE does not appear to be conformant: it seems to
break on</span> the following valid code points (and others):
<ul>
<li>&lt;IEbugs<br>
c1='&amp;#xFFF9;'<br>
c2='&amp;#xFFFA;'<br>
c3='&amp;#xFFFB;'<br>
c4='&amp;#xFFFC;'<br>
c5='&amp;#xFFFD;'<br>
c6='&amp;#xF0000;'<br>
c7='&amp;#xFFFFD;'<br>
c8='&amp;#x100000;'<br>
c9='&amp;#x10FFFD;'/&gt;</li>
</ul>
</li>
</ul>
</td>
</tr>
</table>
<p><a href="UCD-Main.xml">UCD-Main.xml</a> provides an XML format for the main
files in the Unicode Character Database. These include:</p>
<ul>
<li><code>UnicodeData.txt</code></li>
<li><code>ArabicShaping.txt</code></li>
<li><code>Jamo.txt</code></li>
<li><code>SpecialCasing.txt</code></li>
<li><code>CompositionExclusions.txt</code></li>
<li><code>EastAsianWidth.txt</code></li>
<li><code>LineBreak.txt</code></li>
<li><code>BidiMirroring.txt</code></li>
<li><code>CaseFolding.txt</code></li>
<li><code>Blocks.txt</code></li>
<li><code>PropList.alpha.txt</code></li>
</ul>
<p>Other files in the UCD have very different structure or purpose, and are best
expressed with separate files. Some annotational data, such as that in
NamesList.txt or the 10646 comment in UnicodeData, is also best served with
separate files. The current UCD files not yet in XML format are:</p>
<ul>
<li><code>Unihan.txt</code></li>
<li><code>NamesList.txt</code></li>
<li><code>Index.txt</code></li>
<li><code>NormalizationTest.txt</code></li>
</ul>
<h3>Format</h3>
<p>The Unicode blocks are provided as a list of &lt;block .../&gt; elements,
with attributes providing the start, end, and name.</p>
<p>Each assigned code point is a &lt;e .../&gt; element, with attributes
supplying specific properties. The meaning of the attributes is specified below.
There is one exception: large ranges of code points&nbsp; for characters such as
Hangul Syllables are abbreviated by indicating the start and end of the range.</p>
<p>Because of the volume of data, the attribute names are abbreviated. A <a
href="#AttributeAbbreviations">key</a> explains the abbreviations, and relates
them to the fields and values of the original UCD semicolon-delimited files.
With few exceptions, the values in the XML are directly copied from data in the
original UCD semicolon-delimited files. Those exceptions are described <a
href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html#DataModifications">below</a>.</p>
<p>Numeric character references (NCRs) are used to encode the Unicode code
points. Some Unicode code points cannot be transmitted in XML, even as NCRs (see
<a href="http://www.w3.org/TR/REC-xml#charsets">http://www.w3.org/TR/REC-xml#charsets</a>),
or would not be visibly distinct (TAB, CR, LF) in the data. Such code points are
represented by '#xX;', where X is a hex number.</p>
<h3><a name="AttributeAbbreviations">Attribute Abbreviations</a></h3>
<p>To reduce the size of the document, the following attribute abbreviations are
used. If an attribute is missing, that means it gets a default value. The
defaults are listed in parentheses below. If there is no specific default, then
a missing attribute should be read as N/A (not applicable). A default with '='
means the default is the value of another field (recursively!). Thus if
the titlecase attribute is missing, then the value is the same as the uppercase.
If that in turn is missing, then the value is the same as the code point itself.</p>
<p>For a description of the source files, see <a
href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">UnicodeCharacterDatabase.html</a>.
That file also has links to the descriptions of the fields within the files.
Since the PropList values are so long, they will probably also be abbreviated in
the future.</p>
<table border="1" width="100%">
<tr>
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
<h4>UnicodeData</h4>
<p>&nbsp; c: code point<br>
&nbsp; n: name<br>
&nbsp; gc: general category (Lo)<br>
&nbsp; cc: combining class (0)<br>
&nbsp; bc: bidi category (L)<br>
&nbsp; dm: decomposition mapping<br>
&nbsp; dt: decomposition type (canonical)<br>
&nbsp; nt: numeric type<br>
&nbsp; nv: numeric value<br>
&nbsp; bm: bidi mirrored (N)<br>
&nbsp; uc: uppercase (=c)<br>
&nbsp; lc: lowercase (=c)<br>
&nbsp; tc: titlecase (=uc)</p>
<h4>SpecialCasing:</h4>
<p>&nbsp; sl: special lower (=lc)<br>
&nbsp; su: special upper (=uc)<br>
&nbsp; st: special title (=su)<br>
&nbsp; sc: special case condition</p>
<h4>CaseFolding:</h4>
<p>&nbsp; fc: foldcase (=sl)</span></td>
<td width="50%" valign="top"><span class="cb" id style="DISPLAY: block">
<h4>CompositionExclusions:</h4>
<p>&nbsp; ce: composition exclusion (N)</p>
<h4>EastAsianWidth:</h4>
<p>&nbsp; ea: east asian width (N)</p>
<h4>Jamo:</h4>
<p>&nbsp; jn: jamo name</p>
<h4>LineBreak:</h4>
<p>&nbsp; lb: line break class (AL)</p>
<h4>ArabicShaping:</h4>
<p>&nbsp; jt: joining type<br>
&nbsp; jg: joining group</p>
<h4>BidiMirroring:</h4>
<p>&nbsp; bg: bidi mirroring glyph (=c)</p>
<p><b>PropList:</b></p>
<p>&nbsp; xs: space-delimited list of properties from the file</p>
<p><b><i>WARNING: these values are likely to change!</i></b></span></td>
</tr>
</table>
<br>
<h3><a name="DataModifications">Data Modifications</a></h3>
</span>
<p>The XML format is generated from the original semicolon-delimited UCD files.
In general, all fields and values are direct copies. However, there are some
changes, detailed below.</p>
<h4>1. Some redundant or annotational fields are omitted</h4>
<table border="1" width="100%">
<tr>
<td width="50%" valign="top"><b>UnicodeData<br>
</b>1.0 Name<br>
10646 comment<br>
<br>
<b>CaseFolding<br>
</b>Type (since it is computable from whether the fold equals the normal
lowercase)
<p><b>ArabicShaping<br>
</b>Name<br>
<br>
<b>EastAsianWidth<br>
</b>Name<br>
<br>
<b>LineBreak<br>
</b>Name</p>
</td>
<td width="50%" valign="top"><b>PropList</b><font face="Times New Roman"
color="#000000">
<p>The fields are based on the proposed PropList.alpha, which changes the
fields considerably.</p>
</font>
<p><span class="cb" id style="display: block"><b><i>WARNING: other values
are also likely to change!</i></b></span></p>
</td>
</tr>
</table>
<h4>2. Some fields are broken into several fields; others may be combined into a
single field</h4>
<ul>
<li><b>dt: </b>decomposition tag
<ul>
<li>the 'tag' field extracted from the decomposition mapping. If there is
no tag, the value is &quot;canonical&quot;. Only has meaning if there is
a decomposition (<b>dm</b>).</li>
</ul>
</li>
<li><b>nt: </b>numeric type
<ul>
<li>an enumeration [decimal, digit, numeric] for the type of number. It
replaces having duplicate field values for numbers</li>
</ul>
</li>
<li><b>rg: </b>range
<ul>
<li>used for ranges of values that share characteristics, instead of
having to do a substring check.<br>
&quot;START&quot; corresponds to &quot;&lt;..., First&gt;&quot;<br>
&quot;END&quot; corresponds to &quot;&lt;..., Last&gt;&quot;</li>
</ul>
</li>
<li><b>nc: </b>name computed
<ul>
<li>if &quot;COMPUTED&quot;, indicates that the name must be computed:
e.g. Hangul Syllables, Ideographs</li>
</ul>
</li>
<li><b>na: </b>name annotation
<ul>
<li>used for code points that do not really have associated names, like
control characters and private use characters. The data in that case is
either extracted from the &quot;&lt;...&gt;&quot; style name in the old
format, or gotten from the &quot;1.0 Unicode name&quot;.</li>
</ul>
</li>
</ul>
</body>
</html>

View file

@ -0,0 +1,974 @@
package com.ibm.text.UCD;
import java.util.HashMap;
import java.util.BitSet;
import java.util.Map;
import java.io.IOException;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import com.ibm.text.utility.*;
public final class UCD implements UCD_Types {
/**
* Used for the default version.
*/
public static final String latestVersion = "3.1.1";
/**
* Create singleton instance for default (latest) version
*/
public static UCD make() {
return make("");
}
/**
 * Returns the instance for the given version, loading it from file on first
 * use and caching it in versionCache.
 *
 * @param version version to load; null or "" selects latestVersion
 */
public static UCD make(String version) {
    final String key = (version == null || version.length() == 0) ? latestVersion : version;

    final UCD cached = (UCD) versionCache.get(key);
    if (cached != null) {
        return cached;
    }

    final UCD loaded = new UCD();
    loaded.fillFromFile(key);
    versionCache.put(key, loaded);
    return loaded;
}
/**
 * Returns the version string this instance was loaded for.
 */
public String getVersion() {
    return this.version;
}
/**
 * Returns the date value read from the header of the binary data file
 * (-1 if nothing has been loaded).
 */
public long getDate() {
    return this.date;
}
/**
 * Is the code point allocated? Returns true when the code point has a
 * general category other than Cn, or when it is a noncharacter:
 * <ul>
 * <li>any code point whose low 16 bits are FFFE or FFFF (for data versions
 *     before 2.x only those in the BMP), and</li>
 * <li>U+FDD0..U+FDEF when the loaded data is version 3.1 or later.</li>
 * </ul>
 *
 * @param codePoint the code point to test
 * @return true if assigned or a noncharacter for the loaded version
 */
public boolean isAllocated(int codePoint) {
    if (getCategory(codePoint) != Cn) {
        return true;
    }
    if ((codePoint & 0xFFFE) == 0xFFFE) {
        if (major < 2 && codePoint > 0xFFFF) {
            return false;
        }
        return true; // Noncharacter
    }
    // Compare (major, minor) as a pair: the previous test "major >= 3 && minor >= 1"
    // would have rejected every x.0 release after 3.1 (e.g. 4.0).
    final boolean atLeast3_1 = major > 3 || (major == 3 && minor >= 1);
    if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && atLeast3_1) {
        return true;
    }
    return false;
}
/**
 * Is the code point assigned to a character (or surrogate)?
 * True exactly when its general category is not Cn.
 */
public boolean isAssigned(int codePoint) {
    final byte category = getCategory(codePoint);
    return category != Cn;
}
/**
 * Is the code point a private-use character? This is a pure range check
 * that does not consult the loaded data:
 * U+E000..U+F8FF, U+F0000..U+FFFFD and U+100000..U+10FFFD.
 */
public boolean isPUA(int codePoint) {
    if (codePoint < 0xE000) {
        return false;
    }
    if (codePoint <= 0xF8FF) {
        return true;            // BMP private use area
    }
    if (codePoint < 0xF0000) {
        return false;
    }
    if (codePoint <= 0xFFFFD) {
        return true;            // plane 15
    }
    if (codePoint < 0x100000) {
        return false;           // U+FFFFE, U+FFFFF
    }
    return codePoint <= 0x10FFFD; // plane 16
}
/**
 * Many ranges are elided in the UCD: only their first code point has a
 * stored record. Returns true when this exact code point has its own
 * stored record.
 */
public boolean isRepresented(int codePoint) {
    final UData stored = getRaw(codePoint);
    return stored != null;
}
/**
 * Returns the FULL string form of the data associated with the code point
 * (described by the original author as XML).
 */
public String toString(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.toString(FULL);
}
/**
 * Returns the character name, with names of elided-range code points
 * filled in by {@code get(codePoint, true)}.
 */
public String getName(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.name;
}
/**
 * Returns the character names for the code points in a string, joined with ", ".
 * The string is walked by code point, using UTF32 to decode and to advance.
 */
public String getName(String s) {
    if (s.length() == 1) {
        // fast path: a single UTF-16 unit
        return get(s.charAt(0), true).name;
    }
    StringBuffer names = new StringBuffer();
    int offset = 0;
    while (offset < s.length()) {
        final int codePoint = UTF32.char32At(s, offset);
        if (offset > 0) {
            names.append(", ");
        }
        names.append(getName(codePoint));
        offset += UTF32.count16(codePoint);
    }
    return names.toString();
}
/**
 * Returns the code point in U+ notation: "U+" followed by Utility.hex of the value.
 */
public static String getCode(int codePoint) {
    final String hexDigits = Utility.hex(codePoint);
    return "U+" + hexDigits;
}
/**
 * Returns the code points of a string in U+ notation, joined with ", ".
 */
public static String getCode(String s) {
    if (s.length() == 1) {
        // fast path: a single UTF-16 unit
        final int only = s.charAt(0);
        return getCode(only);
    }
    StringBuffer codes = new StringBuffer();
    int offset = 0;
    while (offset < s.length()) {
        final int codePoint = UTF32.char32At(s, offset);
        if (offset > 0) {
            codes.append(", ");
        }
        codes.append(getCode(codePoint));
        offset += UTF32.count16(codePoint);
    }
    return codes.toString();
}
/**
 * Returns the number and name ("U+xxxx NAME") for a code point.
 */
public String getCodeAndName(int codePoint) {
    final String code = getCode(codePoint);
    final String name = getName(codePoint);
    return code + " " + name;
}
/**
 * Returns the number and name ("U+xxxx NAME") for each code point in a
 * string, joined with ", ". A null or empty string yields the literal "NULL".
 */
public String getCodeAndName(String s) {
    if (s == null || s.length() == 0) {
        return "NULL";
    }
    if (s.length() == 1) {
        // fast path: a single UTF-16 unit
        final int only = s.charAt(0);
        return getCodeAndName(only);
    }
    StringBuffer described = new StringBuffer();
    int offset = 0;
    while (offset < s.length()) {
        final int codePoint = UTF32.char32At(s, offset);
        if (offset > 0) {
            described.append(", ");
        }
        described.append(getCodeAndName(codePoint));
        offset += UTF32.count16(codePoint);
    }
    return described.toString();
}
/**
 * Returns the general category value stored for the code point.
 */
public byte getCategory(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.generalCategory;
}
/**
 * Maps a general category to the mask constant of its main category
 * (letter, mark, number, separator, control, punctuation, symbol, unassigned).
 *
 * @param cat a general category value
 * @return the corresponding *_MASK constant
 * @throws IllegalArgumentException if cat is not one of the listed categories
 */
public static int mainCategoryMask(byte cat) {
    switch (cat) {
    case Cn:
        return UNASSIGNED_MASK;
    case Lu: case Ll: case Lt: case Lm: case Lo:
        return LETTER_MASK;
    case Mn: case Me: case Mc:
        return MARK_MASK;
    case Nd: case Nl: case No:
        return NUMBER_MASK;
    case Zs: case Zl: case Zp:
        return SEPARATOR_MASK;
    case Cc: case Cf: case Cs: case Co:
        return CONTROL_MASK;
    case Pc: case Pd: case Ps: case Pe: case Po: case Pi: case Pf:
        return PUNCTUATION_MASK;
    case Sm: case Sc: case Sk: case So:
        return SYMBOL_MASK;
    default:
        throw new IllegalArgumentException ("Illegal General Category " + cat);
    }
}
/**
 * Returns the combining class as a number between 0 and 255. The stored
 * value is masked with 0xFF and widened to a short so that values above 127
 * do not come back negative.
 */
public short getCombiningClass(int codePoint) {
    final int unsignedValue = get(codePoint, false).combiningClass & 0xFF;
    return (short) unsignedValue;
}
/**
 * Does this combining class actually occur in this version of the data?
 * The argument is treated as unsigned (masked with 0xFF).
 */
public boolean isCombiningClassUsed(byte value) {
    final int unsignedValue = 0xFF & value;
    return combiningClassSet.get(unsignedValue);
}
/**
 * Returns the bidi class value stored for the code point.
 */
public byte getBidiClass(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.bidiClass;
}
/**
 * Returns the RAW (one-level) decomposition mapping. It must be applied
 * recursively to obtain the full mapping.
 */
public String getDecompositionMapping(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.decompositionMapping;
}
/**
 * Returns the bidi mirroring value stored for the code point.
 */
public String getBidiMirror(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.bidiMirror;
}
/**
 * Returns the RAW decomposition type: the &lt;...&gt; field in the UCD data.
 */
public byte getDecompositionType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.decompositionType;
}
/**
 * Returns the numeric value stored for the code point.
 */
public float getNumericValue(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.numericValue;
}
/**
 * Returns the numeric type value stored for the code point.
 */
public byte getNumericType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.numericType;
}
/**
 * Returns the case mapping of a code point with no special-casing condition.
 * Equivalent to the four-argument form with an empty condition.
 */
public String getCase(int codePoint, byte simpleVsFull, byte caseType) {
    final String noCondition = "";
    return getCase(codePoint, simpleVsFull, caseType, noCondition);
}
/**
 * Returns the case mapping of a string with no special-casing condition.
 * Equivalent to the four-argument form with an empty condition.
 */
public String getCase(String s, byte simpleVsFull, byte caseType) {
    final String noCondition = "";
    return getCase(s, simpleVsFull, caseType, noCondition);
}
/**
 * Returns one of the eight stored case mappings (simple/full x
 * upper/lower/title/fold) for a code point.
 *
 * @param codePoint the code point to map
 * @param simpleVsFull SIMPLE or FULL
 * @param caseType a value between LOWER and FOLD inclusive
 * @param condition special-casing condition; must not be null (it is
 *        dereferenced when a FULL non-fold mapping has special casing)
 * @return the selected mapping string from the code point's record
 * @throws IllegalArgumentException if simpleVsFull or caseType is out of range
 */
public String getCase(int codePoint, byte simpleVsFull, byte caseType, String condition) {
UData udata = get(codePoint, true);
if (caseType < LOWER || caseType > FOLD
|| (simpleVsFull != SIMPLE && simpleVsFull != FULL)) {
throw new IllegalArgumentException("simpleVsFull or caseType out of bounds");
}
if (caseType < FOLD) {
// Upper/lower/title: when the record has special casing, the FULL mapping is
// used only if a non-empty condition occurs as a substring of the record's
// specialCasing string; otherwise the request is downgraded to SIMPLE.
if (simpleVsFull == FULL && udata.specialCasing.length() != 0) {
if (condition.length() == 0
|| udata.specialCasing.indexOf(condition) < 0) {
simpleVsFull = SIMPLE;
}
}
} else {
// special case. For these characters alone, use "I" as option meaning collapse to "i"
//if (codePoint == 0x0131 || codePoint == 0x0130) { // special case turkish i
// For CaseFoldTurkishI code points the caller's simpleVsFull is overridden.
// NOTE(review): the comment above speaks of "I" as an option, yet the test is
// made on the record's specialCasing rather than on `condition` -- confirm intent.
if (getBinaryProperty(codePoint, CaseFoldTurkishI)) {
if (!udata.specialCasing.equals("I")) simpleVsFull = SIMPLE;
else simpleVsFull = FULL;
}
}
// Dispatch on the sum of the two selectors; presumably the constants are chosen
// so that every (caseType, simpleVsFull) pair has a distinct sum -- the case
// labels below would not compile otherwise.
switch (caseType + simpleVsFull) {
case SIMPLE + UPPER: return udata.simpleUppercase;
case SIMPLE + LOWER: return udata.simpleLowercase;
case SIMPLE + TITLE: return udata.simpleTitlecase;
case SIMPLE + FOLD: return udata.simpleCaseFolding;
case FULL + UPPER: return udata.fullUppercase;
case FULL + LOWER: return udata.fullLowercase;
case FULL + TITLE: return udata.fullTitlecase;
case FULL + FOLD: return udata.fullCaseFolding;
}
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
}
/**
 * Returns the case mapping of a string, applied code point by code point.
 * For TITLE, the first cased letter is titlecased and following letters are
 * lowercased until a character that is neither a mark nor cased resets the
 * state back to TITLE.
 *
 * @param s the string to map
 * @param simpleVsFull SIMPLE or FULL
 * @param caseType the requested case type
 * @param condition special-casing condition, forwarded to the per-code-point
 *        mapping for every code point
 * @return the concatenated mappings
 */
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
    if (UTF32.length32(s) == 1) {
        // Fast path. The condition must be forwarded here as well; previously it
        // was dropped, so single-code-point strings ignored conditional casing
        // while longer strings honoured it.
        return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType, condition);
    }
    StringBuffer result = new StringBuffer();
    byte currentCaseType = caseType;
    int cp;
    for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
        cp = UTF32.char32At(s, i);
        result.append(getCase(cp, simpleVsFull, currentCaseType, condition));
        if (caseType != TITLE) {
            continue;
        }
        // Titlecasing state machine: decide how the NEXT code point is mapped.
        byte cat = getCategory(cp);
        if (cat == Mn || cat == Me || cat == Mc) {
            // marks do not change the state
        } else if (cat == Lu || cat == Ll || cat == Lt
                || getBinaryProperty(cp, Other_Lowercase)
                || getBinaryProperty(cp, Other_Uppercase)) {
            currentCaseType = LOWER;    // inside a word
        } else {
            currentCaseType = TITLE;    // uncased: next cased letter starts a word
        }
    }
    return result.toString();
}
/*
public String getSimpleLowercase(int codePoint) {
return get(codePoint, true).simpleLowercase;
}
public String getSimpleUppercase(int codePoint) {
return get(codePoint, true).simpleUppercase;
}
public String getSimpleTitlecase(int codePoint) {
return get(codePoint, true).simpleTitlecase;
}
public String getSimpleCaseFolding(int codePoint) {
return get(codePoint, true).simpleCaseFolding;
}
public String getFullLowercase(int codePoint) {
return get(codePoint, true).fullLowercase;
}
public String getFullUppercase(int codePoint) {
return get(codePoint, true).fullUppercase;
}
public String getFullTitlecase(int codePoint) {
return get(codePoint, true).fullTitlecase;
}
public String getFullCaseFolding(int codePoint) {
return get(codePoint, true).simpleCaseFolding;
}
public String getLowercase(int codePoint, boolean full) {
if (full) return getFullLowercase(codePoint);
return getSimpleLowercase(codePoint);
}
public String getUppercase(int codePoint, boolean full) {
if (full) return getFullUppercase(codePoint);
return getSimpleLowercase(codePoint);
}
public String getTitlecase(int codePoint, boolean full) {
if (full) return getFullTitlecase(codePoint);
return getSimpleTitlecase(codePoint);
}
public String getCaseFolding(int codePoint, boolean full) {
if (full) return getFullCaseFolding(codePoint);
return getSimpleCaseFolding(codePoint);
}
public String getLowercase(String s, boolean full) {
if (s.length() == 1) return getLowercase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(", ");
result.append(getLowercase(cp, true));
}
return result.toString();
}
public String getUppercase(String s, boolean full) {
if (s.length() == 1) return getUppercase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(", ");
result.append(getUppercase(cp, true));
}
return result.toString();
}
public String getTitlecase(String s, boolean full) {
if (s.length() == 1) return getTitlecase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(", ");
result.append(getTitlecase(cp, true));
}
return result.toString();
}
public String getCaseFolding(String s, boolean full) {
if (s.length() == 1) return getCaseFolding(s.charAt(0), true);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(", ");
result.append(getCaseFolding(cp, true));
}
return result.toString();
}
*/
/** Returns the special-casing string stored for the code point. */
public String getSpecialCase(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.specialCasing;
}
/** Returns the East Asian width value stored for the code point. */
public byte getEastAsianWidth(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.eastAsianWidth;
}
/** Returns the line break value stored for the code point. */
public byte getLineBreak(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.lineBreak;
}
/** Returns the script value stored for the code point. */
public byte getScript(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.script;
}
/** Returns the age value stored for the code point. */
public byte getAge(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.age;
}
/** Returns the joining type value stored for the code point. */
public byte getJoiningType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.joiningType;
}
/** Returns the joining group value stored for the code point. */
public byte getJoiningGroup(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.joiningGroup;
}
/** Returns all binary properties of the code point as a bit set packed in an int. */
public int getBinaryProperties(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.binaryProperties;
}
/** Returns whether the given bit is set in the code point's binary properties. */
public boolean getBinaryProperty(int codePoint, int bit) {
    final int allBits = get(codePoint, false).binaryProperties;
    final int wanted = 1 << bit;
    return (allBits & wanted) != 0;
}
// ENUM mask utilities
//
// Each method returns an int with a single bit set, at the position given by
// the property value. Java uses only the low five bits of an int shift
// distance, so a property value of 32 or more lands on the same bit as
// (value - 32).
// NOTE(review): the SCRIPT name table has more than 32 entries, so
// getScriptMask presumably aliases for the higher script values -- confirm
// against callers.
/** Returns a one-bit mask for the code point's general category. */
public int getCategoryMask(int codePoint) {
    final int shift = get(codePoint, false).generalCategory;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's bidi class. */
public int getBidiClassMask(int codePoint) {
    final int shift = get(codePoint, false).bidiClass;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's numeric type. */
public int getNumericTypeMask(int codePoint) {
    final int shift = get(codePoint, false).numericType;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's decomposition type. */
public int getDecompositionTypeMask(int codePoint) {
    final int shift = get(codePoint, false).decompositionType;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's East Asian width. */
public int getEastAsianWidthMask(int codePoint) {
    final int shift = get(codePoint, false).eastAsianWidth;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's line break value. */
public int getLineBreakMask(int codePoint) {
    final int shift = get(codePoint, false).lineBreak;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's script. */
public int getScriptMask(int codePoint) {
    final int shift = get(codePoint, false).script;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's age. */
public int getAgeMask(int codePoint) {
    final int shift = get(codePoint, false).age;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's joining type. */
public int getJoiningTypeMask(int codePoint) {
    final int shift = get(codePoint, false).joiningType;
    return 1 << shift;
}
/** Returns a one-bit mask for the code point's joining group. */
public int getJoiningGroupMask(int codePoint) {
    final int shift = get(codePoint, false).joiningGroup;
    return 1 << shift;
}
// VERSIONS WITH NAMES
/** Returns the short name of the code point's general category. */
public String getCategoryID(int codePoint) {
    final byte value = getCategory(codePoint);
    return getCategoryID_fromIndex(value);
}
/** Looks up a general category value in UCD_Names.GC. */
public static String getCategoryID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.GC[index];
}
/** Returns the name of the code point's bidi class. */
public String getBidiClassID(int codePoint) {
    final byte value = getBidiClass(codePoint);
    return getBidiClassID_fromIndex(value);
}
/** Looks up a bidi class value in UCD_Names.BC. */
public static String getBidiClassID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.BC[index];
}
/** Returns the code point's combining class as a decimal string. */
public String getCombiningClassID(int codePoint) {
    final short value = getCombiningClass(codePoint);
    return getCombiningClassID_fromIndex(value);
}
/** Formats a combining class value as its decimal string. */
public static String getCombiningClassID_fromIndex(short cc) {
    return Integer.toString(cc);
}
/** Returns the name of the code point's decomposition type. */
public String getDecompositionTypeID(int codePoint) {
    final byte value = getDecompositionType(codePoint);
    return getDecompositionTypeID_fromIndex(value);
}
/** Looks up a decomposition type value in UCD_Names.DT. */
public static String getDecompositionTypeID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.DT[index];
}
/** Returns the name of the code point's numeric type. */
public String getNumericTypeID(int codePoint) {
    final byte value = getNumericType(codePoint);
    return getNumericTypeID_fromIndex(value);
}
/** Looks up a numeric type value in UCD_Names.NT. */
public static String getNumericTypeID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.NT[index];
}
/** Returns the name of the code point's East Asian width. */
public String getEastAsianWidthID(int codePoint) {
    final byte value = getEastAsianWidth(codePoint);
    return getEastAsianWidthID_fromIndex(value);
}
/** Looks up an East Asian width value in UCD_Names.EA. */
public static String getEastAsianWidthID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.EA[index];
}
/** Returns the name of the code point's line break value. */
public String getLineBreakID(int codePoint) {
    final byte value = getLineBreak(codePoint);
    return getLineBreakID_fromIndex(value);
}
/** Looks up a line break value in UCD_Names.LB. */
public static String getLineBreakID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.LB[index];
}
/** Returns the name of the code point's joining type. */
public String getJoiningTypeID(int codePoint) {
    final byte value = getJoiningType(codePoint);
    return getJoiningTypeID_fromIndex(value);
}
/** Looks up a joining type value in UCD_Names.JOINING_TYPE. */
public static String getJoiningTypeID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.JOINING_TYPE[index];
}
/** Returns the name of the code point's joining group. */
public String getJoiningGroupID(int codePoint) {
    final byte value = getJoiningGroup(codePoint);
    return getJoiningGroupID_fromIndex(value);
}
/** Looks up a joining group value in UCD_Names.JOINING_GROUP. */
public static String getJoiningGroupID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.JOINING_GROUP[index];
}
/** Returns the name of the code point's script. */
public String getScriptID(int codePoint) {
    final byte value = getScript(codePoint);
    return getScriptID_fromIndex(value);
}
/** Looks up a script value in UCD_Names.SCRIPT. */
public static String getScriptID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.SCRIPT[index];
}
/** Returns the name of the code point's age. */
public String getAgeID(int codePoint) {
    final byte value = getAge(codePoint);
    return getAgeID_fromIndex(value);
}
/** Looks up an age value in UCD_Names.AGE. */
public static String getAgeID_fromIndex(byte prop) {
    final int index = prop;
    return UCD_Names.AGE[index];
}
/** Returns "Y" if the given binary property bit is set for the code point, else "N". */
public String getBinaryPropertiesID(int codePoint, byte bit) {
    final int wanted = 1 << bit;
    if ((getBinaryProperties(codePoint) & wanted) != 0) {
        return "Y";
    }
    return "N";
}
/** Looks up the name of a binary property bit in UCD_Names.BP. */
public static String getBinaryPropertiesID_fromIndex(byte bit) {
    final int index = bit;
    return UCD_Names.BP[index];
}
/**
 * Maps a code point to the representative of the elided range containing it.
 * Code points inside one of the ranges below map to the range's first code
 * point; noncharacters map to 0xFFFF; everything else maps to itself.
 *
 * @param ch the code point
 * @param old if true, U+F900..U+FA2D (CJK compatibility ideographs) is also
 *        treated as an elided range
 * @return the representative code point
 */
public static int mapToRepresentative(int ch, boolean old) {
    if (ch > 0xFFFD) {
        if ((ch & 0xFFFE) == 0xFFFE) {
            return 0xFFFF;                                        // Noncharacter
        }
        if (0x20000 <= ch && ch <= 0x2A6D6) {
            return 0x20000;                                       // CJK Ideograph Extension B
        }
        // 0x2F800..0x2FA1D (compatibility ideographs) is deliberately not collapsed
        if (0xF0000 <= ch && ch <= 0xFFFFD) {
            return 0xF0000;                                       // Plane 15 Private Use
        }
        if (0x100000 <= ch && ch <= 0x10FFFD) {
            return 0x100000;                                      // Plane 16 Private Use
        }
        return ch;
    }
    // BMP (and anything below it); ranges are tested in ascending order.
    // 0x2800..0x28FF (braille) is deliberately not collapsed.
    if (ch < 0x3400) {
        return ch;
    }
    if (ch <= 0x4DB5) {
        return 0x3400;                                            // CJK Ideograph Extension A
    }
    if (ch < 0x4E00) {
        return ch;
    }
    if (ch <= 0x9FA5) {
        return 0x4E00;                                            // CJK Ideograph
    }
    if (ch < 0xAC00) {
        return ch;
    }
    if (ch <= 0xD7A3) {
        return 0xAC00;                                            // Hangul Syllable
    }
    if (ch < 0xD800) {
        return ch;
    }
    if (ch <= 0xDB7F) {
        return 0xD800;                                            // Non Private Use High Surrogate
    }
    if (ch <= 0xDBFF) {
        return 0xDB80;                                            // Private Use High Surrogate
    }
    if (ch <= 0xDFFF) {
        return 0xDC00;                                            // Low Surrogate
    }
    if (ch <= 0xF8FF) {
        return 0xE000;                                            // Private Use
    }
    if (old && ch <= 0xFA2D) {
        return 0xF900;                                            // CJK Compatibility Ideograph
    }
    if (0xFDD0 <= ch && ch <= 0xFDEF) {
        return 0xFFFF;                                            // Noncharacter
    }
    return ch;
}
/**
 * Can the code point start an identifier? True for general categories
 * Lu, Ll, Lt, Lm, Lo and Nl. When extended is true, a fixed list of code
 * points is rejected before the category is consulted.
 *
 * @param cp the code point
 * @param extended whether to apply the extra exclusions
 */
public boolean isIdentifierStart(int cp, boolean extended) {
    if (extended) {
        switch (cp) {
        case 0x0E33: case 0x0EB3: case 0xFF9E: case 0xFF9F:   // allowed only as continue characters
        case 0x037A: case 0xFDFA: case 0xFDFB:
            return false;
        default:
            break;
        }
        if (0xFC5E <= cp && cp <= 0xFC63) {
            return false;
        }
        // even code points of U+FE70..U+FE7E
        if (0xFE70 <= cp && cp <= 0xFE7E && (cp & 1) == 0) {
            return false;
        }
    }
    switch (getCategory(cp)) {
    case Lu: case Ll: case Lt: case Lm: case Lo: case Nl:
        return true;
    default:
        return false;
    }
}
/**
 * Can the code point continue an identifier? True for anything accepted by
 * isIdentifierStart and for general categories Mn, Mc, Nd and Pc. When
 * extended is true, U+00B7, U+0E33, U+0EB3, U+FF9E and U+FF9F are accepted
 * too. Category Cf is not accepted here.
 *
 * @param cp the code point
 * @param extended whether to apply the extra inclusions
 */
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
    if (isIdentifierStart(cp, extended)) {
        return true;
    }
    if (extended) {
        switch (cp) {
        case 0x00B7:                                          // MIDDLE DOT
        case 0x0E33: case 0x0EB3: case 0xFF9E: case 0xFF9F:
            return true;
        default:
            break;
        }
    }
    switch (getCategory(cp)) {
    case Mn: case Mc: case Nd: case Pc:
        return true;
    default:
        return false;
    }
}
/**
 * Is the string an identifier? It must be non-empty, its first code point
 * must satisfy isIdentifierStart, and every later code point must satisfy
 * isIdentifierContinue_NO_Cf.
 */
public boolean isIdentifier(String s, boolean extended) {
    if (s.length() == 0) {
        return false; // at least one!
    }
    int offset = 0;
    while (offset < s.length()) {
        final int cp = UTF32.char32At(s, offset);
        final boolean acceptable = (offset == 0)
                ? isIdentifierStart(cp, extended)
                : isIdentifierContinue_NO_Cf(cp, extended);
        if (!acceptable) {
            return false;
        }
        offset += UTF32.count16(cp);
    }
    return true;
}
/*
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
allowed in <identifier_extend>.
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
0E33 THAI CHARACTER SARA AM
0EB3 LAO VOWEL SIGN AM
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
Irregularly decomposing characters. U+037A GREEK YPOGEGRAMMENI and certain Arabic presentation
forms have irregular compatibility decompositions, and need to be excluded from both <identifier_start>
and <identifier_extend>. It is recommended that all Arabic presentation forms be excluded from identifiers
in any event, although only a few of them are required to be excluded for normalization
to guarantee identifier closure.
*/
// *******************
// PRIVATES
// *******************
// cache of singletons: version string -> UCD instance, filled by make(String)
private static Map versionCache = new HashMap();
// one past the largest code point; sizes the top level of the data table
private static final int LIMIT_CODE_POINT = 0x110000;
// shared placeholder page (all slots null) used for every 1024-code-point page
// that has no stored record; add() replaces it with a private page on first write
private static final UData[] ALL_NULLS = new UData[1024];
// main data: two-level table indexed by [codePoint >> 10][codePoint & 0x3FF]
private UData[][] data = new UData[LIMIT_CODE_POINT>>10][];
// extras
// combining class values (0..255) that occur in the loaded data
private BitSet combiningClassSet = new BitSet(256);
// version and file name, set by fillFromFile once loading succeeded
private String version;
private String file;
// header fields of the binary data file; -1 until fillFromFile has read them
private long date = -1;
private byte format = -1;
private byte major = -1;
private byte minor = -1;
private byte update = -1;
private int size = -1;
// cache last UData
// NOTE(review): within this class these three fields are referenced only by a
// commented-out line in get(); they look unused -- confirm before removing.
private int lastCode = Integer.MIN_VALUE;
private UData lastResult = UData.UNASSIGNED;
private boolean lastCodeFixed = false;
// Hidden constructor: instances are obtained through make().
// Every page of the data table starts out as the shared ALL_NULLS page.
private UCD() {
    int page = data.length;
    while (--page >= 0) {
        data[page] = ALL_NULLS;
    }
}
// Stores a record in the two-level table under its own code point,
// allocating the 1024-slot page on first use.
private void add(UData uData) {
    final int cp = uData.codePoint;
    final int pageIndex = cp >> 10;
    UData[] page = data[pageIndex];
    if (page == ALL_NULLS) {
        // never write into the shared placeholder page
        page = new UData[1024];
        data[pageIndex] = page;
    }
    page[cp & 0x3FF] = uData;
}
/**
 * Does the code point have a name that is computed rather than stored?
 * True for U+F900..U+FA2D, for any code point whose representative (see
 * mapToRepresentative) is one of the values listed below, and for any other
 * code point that has no stored record.
 * NOTE(review): 0x2800 and 0x2F800 are listed, but mapToRepresentative only
 * returns them for exactly those two code points (the braille and
 * compatibility-ideograph ranges are commented out there) -- confirm intent.
 */
public boolean hasComputableName(int codePoint) {
    if (0xF900 <= codePoint && codePoint <= 0xFA2D) {
        return true;
    }
    switch (mapToRepresentative(codePoint, major < 2)) {
    case 0x2800:   // braille
    case 0xF900:   // compat ideos
    case 0x2F800:  // compat ideos
    case 0x3400:   // CJK Ideograph Extension A
    case 0x4E00:   // CJK Ideograph
    case 0x20000:  // Extension B
    case 0xAC00:   // Hangul Syllable
    case 0xE000:   // Private Use
    case 0xF0000:  // Private Use
    case 0x100000: // Private Use
    case 0xD800:   // Surrogate
    case 0xDB80:   // Private Use High Surrogate
    case 0xDC00:   // Low Surrogate
    case 0xFFFF:   // Noncharacter
        return true;
    default:
        break;
    }
    return getRaw(codePoint) == null;
}
// Returns the record stored for exactly this code point, or null if none.
// No range mapping is applied.
private UData getRaw(int codePoint) {
    final UData[] page = data[codePoint >> 10];
    return page[codePoint & 0x3FF];
}
/**
 * Access the data record for a code point.
 * <p>
 * Code points outside the elided ranges return their stored record, or the
 * shared UData.UNASSIGNED record when nothing is stored. Code points inside
 * an elided range return the record stored for the range's representative
 * (see mapToRepresentative), patched IN PLACE for the requested code point.
 * <p>
 * Because shared records are modified and returned, the fields of a
 * returned record can change on a later call to this method.
 *
 * @param codePoint the code point to look up
 * @param fixStrings if true, the string fields (name and, for elided ranges,
 *        the mapping strings) are filled in for this code point; if false
 *        they are left as they were
 * @return the (possibly shared) record; never null
 */
UData get(int codePoint, boolean fixStrings) {
//if (codePoint == lastCode && fixStrings <= lastCodeFixed) return lastResult;
/*
// we play some funny tricks for performance
// if cp is not represented, it is either in a elided block or missing.
// elided blocks are either CONTINUE or FFFF
byte cat;
if (!ucdData.isRepresented(cp)) {
int rep = UCD.mapToRepresentative(cp);
if (rep == 0xFFFF) cat = Cn;
else if (rep != cp) return CONTINUE;
else if (!ucdData.isRepresented(rep)) cat = Cn;
else cat = ucdData.getCategory(rep);
} else {
cat = ucdData.getCategory(cp);
}
*/
UData result = null;
// do range stuff
String constructedName = null;
int rangeStart = mapToRepresentative(codePoint, major < 2);
boolean isHangul = false;
switch (rangeStart) {
case 0xF900:
// For data versions before 2.x the compatibility ideographs are an elided
// range with constructed names; later versions store them individually.
if (major < 2) {
if (fixStrings) constructedName = "CJK COMPATIBILITY IDEOGRAPH-" + Utility.hex(codePoint, 4);
break;
}
// FALL THROUGH!!!!
default:
// Not in an elided range: use the stored record, or the shared UNASSIGNED
// record (whose name field is overwritten when fixStrings is set).
result = getRaw(codePoint);
if (result == null) {
result = UData.UNASSIGNED;
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
}
return result;
case 0x3400: // CJK Ideograph Extension A
case 0x4E00: // CJK Ideograph
case 0x20000: // Extension B
if (fixStrings) constructedName = "CJK UNIFIED IDEOGRAPH-" + Utility.hex(codePoint, 4);
break;
case 0xAC00: // Hangul Syllable
isHangul = true;
if (fixStrings) {
constructedName = "HANGUL SYLLABLE " + getHangulName(codePoint);
}
break;
case 0xE000: // Private Use
case 0xF0000: // Private Use
case 0x100000: // Private Use
if (fixStrings) constructedName = "<private use-" + Utility.hex(codePoint, 4) + ">";
break;
case 0xD800: // Surrogate
case 0xDB80: // Private Use High Surrogate
case 0xDC00: // Low Surrogate
if (fixStrings) constructedName = "<surrogate-" + Utility.hex(codePoint, 4) + ">";
break;
case 0xFFFF: // Noncharacter
if (fixStrings) constructedName = "<noncharacter-" + Utility.hex(codePoint, 4) + ">";
break;
}
// Elided range: start from the record stored for the range's representative.
result = getRaw(rangeStart);
if (result == null) {
result = UData.UNASSIGNED;
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
return result;
}
// Patch the shared range record in place so that it describes codePoint.
result.codePoint = codePoint;
if (fixStrings) {
result.name = constructedName;
// every string mapping defaults to the code point itself
result.decompositionMapping = result.bidiMirror
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
= result.fullLowercase = result.fullUppercase = result.fullTitlecase = result.fullCaseFolding
= UTF32.valueOf32(codePoint);
}
if (isHangul) {
// Hangul syllables decompose algorithmically; the type is set even when
// the strings are not being fixed.
if (fixStrings) result.decompositionMapping = getHangulDecompositionPair(codePoint);
result.decompositionType = CANONICAL;
}
return result;
}
// Hangul constants for algorithmic syllable composition/decomposition.
// L = leading consonant, V = vowel, T = trailing consonant, S = syllable.
// TBase is one below the first trailing consonant, so T index 0 means "no T".
static final int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
NCount = VCount * TCount, // 588
SCount = LCount * NCount, // 11172
LLimit = LBase + LCount, // 0x1113 (exclusive)
VLimit = VBase + VCount, // 0x1176 (exclusive)
TLimit = TBase + TCount, // 0x11C3 (exclusive)
SLimit = SBase + SCount; // 0xD7A4 (exclusive)
// Builds the name suffix of a Hangul syllable by concatenating the entries
// of the three jamo name tables selected by its L, V and T indices.
// Throws IllegalArgumentException if s is not in SBase..SLimit-1.
private static String getHangulName(int s) {
    final int syllableIndex = s - SBase;
    if (syllableIndex < 0 || syllableIndex >= SCount) {
        throw new IllegalArgumentException("Not a Hangul Syllable: " + s);
    }
    final int leadIndex = syllableIndex / NCount;
    final int vowelIndex = (syllableIndex % NCount) / TCount;
    final int trailIndex = syllableIndex % TCount;
    return UCD_Names.JAMO_L_TABLE[leadIndex] + UCD_Names.JAMO_V_TABLE[vowelIndex] + UCD_Names.JAMO_T_TABLE[trailIndex];
}
/**
 * Returns the one-level (pairwise) decomposition of a Hangul syllable.
 * An LVT syllable decomposes into its LV syllable plus the trailing
 * consonant; an LV syllable decomposes into leading consonant plus vowel.
 * <p>
 * The result is built from local variables only; the previous shared static
 * scratch array made concurrent calls able to corrupt each other's result.
 *
 * @param ch the code point to decompose
 * @return a two-character string, or "" if ch is not a Hangul syllable
 */
static String getHangulDecompositionPair(int ch) {
    final int sIndex = ch - SBase;
    if (sIndex < 0 || sIndex >= SCount) {
        return "";
    }
    final int tIndex = sIndex % TCount;
    final char first;
    final char second;
    if (tIndex != 0) { // triple: LVT -> LV + T
        first = (char)(SBase + sIndex - tIndex);
        second = (char)(TBase + tIndex);
    } else {           // pair: LV -> L + V
        first = (char)(LBase + sIndex / NCount);
        second = (char)(VBase + (sIndex % NCount) / TCount);
    }
    return new String(new char[] {first, second});
}
/**
 * Composes two code points into a Hangul syllable where possible:
 * leading consonant + vowel gives an LV syllable, and LV syllable +
 * trailing consonant gives an LVT syllable.
 * <p>
 * The trailing consonant must be strictly above TBase: TBase itself is
 * T index 0 ("no trailing consonant"), and accepting it would "compose"
 * LV + U+11A7 into the unchanged LV syllable, silently dropping U+11A7.
 *
 * @param char1 the first code point
 * @param char2 the second code point
 * @return the composed syllable, or 0xFFFF if the pair does not compose
 */
static int composeHangul(int char1, int char2) {
    if (LBase <= char1 && char1 < LLimit && VBase <= char2 && char2 < VLimit) {
        return (SBase + ((char1 - LBase) * VCount + (char2 - VBase)) * TCount);
    }
    if (SBase <= char1 && char1 < SLimit && TBase < char2 && char2 < TLimit
            && ((char1 - SBase) % TCount) == 0) {
        return char1 + (char2 - TBase);
    }
    return 0xFFFF; // no composition
}
// True for code points in VBase..VLimit-1 or TBase..TLimit-1, i.e. jamo
// that can follow a leading consonant. Note that TBase itself is included.
static boolean isTrailingJamo(int cp) {
    if (cp >= VBase && cp < VLimit) {
        return true;
    }
    return cp >= TBase && cp < TLimit;
}
/**
 * Loads this instance from the binary data file
 * BIN_DIR + "UCD_Data" + version + ".bin".
 * <p>
 * Layout as read here: four header bytes (format, major, minor, update),
 * a long date, an int record count, then that many records, each read by
 * UData.readBytes. Every record is stored with add() and its combining
 * class is noted in combiningClassSet.
 *
 * @param version the version to load; must equal "major.minor.update" as
 *        found in the file header
 * @throws ChainException if the file cannot be read, or its format byte or
 *         version does not match
 */
private void fillFromFile(String version) {
DataInputStream dataIn = null;
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
int uDataFileCount = 0;
try {
dataIn = new DataInputStream(
new BufferedInputStream(
new FileInputStream(fileName),
128*1024));
// header
format = dataIn.readByte();
major = dataIn.readByte();
minor = dataIn.readByte();
update = dataIn.readByte();
String foundVersion = major + "." + minor + "." + update;
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
new Object[]{version, new Byte(format), foundVersion});
}
date = dataIn.readLong();
size = uDataFileCount = dataIn.readInt();
// set once the first joining-type override has been reported
boolean didJoiningHack = false;
// records
for (int i = 0; i < uDataFileCount; ++i) {
UData uData = new UData();
uData.readBytes(dataIn);
// debugging aid: print one known record to the console
if (uData.codePoint == 0x2801) {
System.out.println("SPOT-CHECK: " + uData);
}
// Joining type T is derived here: T = Mn + (Cf - ZWNJ - ZWJ)
int cp = uData.codePoint;
byte old = uData.joiningType;
byte cat = uData.generalCategory;
//if (cp == 0x200D) {
// uData.joiningType = JT_C;
//} else
if (cp != 0x200D && cp != 0x200C && (cat == Mn || cat == Cf)) {
uData.joiningType = JT_T;
}
// report only the first record whose joining type was changed
if (!didJoiningHack && uData.joiningType != old) {
System.out.println("HACK: Setting "
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
+ ": " + Utility.hex(cp) + " " + uData.name);
didJoiningHack = true;
}
combiningClassSet.set(uData.combiningClass & 0xFF);
add(uData);
}
/*
if (update == -1) {
throw new ChainException("Data File truncated for ",
new Object[]{version}, e);
}
if (size != fileSize) {
throw new ChainException("Counts do not match: file {0}, records {1}",
new Object[]{new Integer(fileSize), new Integer(size)});
}
*/
// everything is ok!
this.version = version;
this.file = fileName;
//+ " " + new File(fileName).lastModified();
} catch (IOException e) {
throw new ChainException("Can't read data file for {0}", new Object[]{version}, e);
} finally {
// best-effort close: a failure to close is deliberately ignored
if (dataIn != null) {
try {
dataIn.close();
} catch (IOException e) {}
}
}
}
}

View file

@ -0,0 +1,750 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
final class UCD_Names implements UCD_Types {
static final String[] UNIFIED_PROPERTIES = {
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
"Combining Class (listing UnicodeData.txt, field 3: see UnicodeData.html)",
"Bidi Class (listing UnicodeData.txt, field 4: see UnicodeData.html)",
"Decomposition Type (from UnicodeData.txt, field 5: see UnicodeData.html)",
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
"East Asian Width (listing EastAsianWidth.txt, field 1)",
"Line Break (listing LineBreak.txt, field 1)",
"Joining Type (listing ArabicShaping.txt, field 1).\r\n"
+ "#\tType T is derived from Mn + Cf - ZWNJ - ZWJ\r\n"
+ "#\tAll other code points have the type U",
"Joining Group (listing ArabicShaping.txt, field 2)",
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
"Script",
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
};
static final String[] SHORT_UNIFIED_PROPERTIES = {
"GeneralCategory",
"CombiningClass",
"BidiClass",
"DecompositionType",
"NumericType",
"EastAsianWidth",
"LineBreak",
"JoiningType",
"JoiningGroup",
"Value",
"Script",
"Age"
};
static final String[] ABB_UNIFIED_PROPERTIES = {
"gc",
"cc",
"bc",
"dt",
"nt",
"ea",
"lb",
"jt",
"jg",
"va",
"sc",
"Ag"
};
static final String[] BP = {
"BidiMirrored",
"CompositionExclusion",
"White_Space",
"NonBreak",
"Bidi_Control",
"Join_Control",
"Dash",
"Hyphen",
"Quotation_Mark",
"Terminal_Punctuation",
"Other_Math",
"Hex_Digit",
"ASCII_Hex_Digit",
"Other_Alphabetic",
"Ideographic",
"Diacritic",
"Extender",
"Other_Lowercase",
"Other_Uppercase",
"Noncharacter_Code_Point",
"CaseFoldTurkishI",
"Other_GraphemeExtend",
"GraphemeLink",
"IDS_BinaryOperator",
"IDS_TrinaryOperator",
"Radical",
"UnifiedIdeograph",
"Reserved_Cf_Code_Point",
"Deprecated",
};
static final String[] SHORT_BP = {
"BidiM",
"CExc",
"WhSp",
"NBrk",
"BdCon",
"JCon",
"Dash",
"Hyph",
"QMark",
"TPunc",
"OMath",
"HexD",
"AHexD",
"OAlph",
"Ideo",
"Diac",
"Ext",
"OLoc",
"OUpc",
"NChar",
"TurkI",
"OGrX",
"GrLink",
"IDSB",
"IDST",
"Radical",
"UCJK",
"RCf",
"Dep",
};
/*
static final String[] BP_OLD = {
"BidiMirrored",
"CompositionExclusion",
"White_space",
"Non_break",
"Bidi_Control",
"Join_Control",
"Dash",
"Hyphen",
"Quotation_Mark",
"Terminal_Punctuation",
"Math",
"Hex_Digit",
"Other_Alphabetic",
"Ideographic",
"Diacritic",
"Extender",
"Other_Lowercase",
"Other_Uppercase",
"Noncharacter_Code_Point",
"Other_GraphemeExtend",
"GraphemeLink",
"IDS_BinaryOperator",
"IDS_TrinaryOperator",
"Radical",
"UnifiedIdeograph"
};
*/
static final String[] DeletedProperties = {
"Private_Use",
"Composite",
"Format_Control",
"High_Surrogate",
"Identifier_Part_Not_Cf",
"Low_Surrogate",
"Other_Format_Control",
"Private_Use_High_Surrogate",
"Unassigned_Code_Point"
};
static final String[] YN_TABLE = {"N", "Y"};
static String[] EA = {
"N", "A", "H", "W", "F", "Na"
};
static String[] SHORT_EA = {
"Neutral", "Ambiguous", "Halfwidth", "Wide", "Fullwidth", "Narrow"
};
static final String[] LB = {
"XX", "OP", "CL", "QU", "GL", "NS", "EX", "SY",
"IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY",
"CM", "BB", "BA", "SP", "BK", "CR", "LF", "CB",
"SA", "AI", "B2", "SG", "ZW"
};
static final String[] LONG_LB = {
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
"Glue", "Nonstarter", "Exclamation", "BreakSymbols",
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
};
public static final String[] SCRIPT = {
"COMMON", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
"LATIN", // LATIN
"GREEK", // GREEK
"CYRILLIC", // CYRILLIC
"ARMENIAN", // ARMENIAN
"HEBREW", // HEBREW
"ARABIC", // ARABIC
"SYRIAC", // SYRIAC
"THAANA", // THAANA
"DEVANAGARI", // DEVANAGARI
"BENGALI", // BENGALI
"GURMUKHI", // GURMUKHI
"GUJARATI", // GUJARATI
"ORIYA", // ORIYA
"TAMIL", // TAMIL
"TELUGU", // TELUGU
"KANNADA", // KANNADA
"MALAYALAM", // MALAYALAM
"SINHALA", // SINHALA
"THAI", // THAI
"LAO", // LAO
"TIBETAN", // TIBETAN
"MYANMAR", // MYANMAR
"GEORGIAN", // GEORGIAN
"<unused>", // JAMO -- NOT SEPARATED FROM HANGUL IN 15924
"HANGUL", // HANGUL
"ETHIOPIC", // ETHIOPIC
"CHEROKEE", // CHEROKEE
"CANADIAN-ABORIGINAL", // ABORIGINAL
"OGHAM", // OGHAM
"RUNIC", // RUNIC
"KHMER", // KHMER
"MONGOLIAN", // MONGOLIAN
"HIRAGANA", // HIRAGANA
"KATAKANA", // KATAKANA
"BOPOMOFO", // BOPOMOFO
"HAN", // HAN
"YI", // YI
"OLD-ITALIC",
"GOTHIC",
"DESERET",
"INHERITED",
};
public static final String[] ABB_SCRIPT = {
"Zyyy", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
"Latn", // LATIN
"Grek", // GREEK
"Cyrl", // CYRILLIC
"Armn", // ARMENIAN
"Hebr", // HEBREW
"Arab", // ARABIC
"Syrc", // SYRIAC
"Thaa", // THAANA
"Deva", // DEVANAGARI
"Beng", // BENGALI
"Guru", // GURMUKHI
"Gujr", // GUJARATI
"Orya", // ORIYA
"Taml", // TAMIL
"Telu", // TELUGU
"Knda", // KANNADA
"Mlym", // MALAYALAM
"Sinh", // SINHALA
"Thai", // THAI
"Laoo", // LAO
"Tibt", // TIBETAN
"Mymr", // MYANMAR
"Geor", // GEORGIAN
"<unused>", // JAMO -- NOT SEPARATED FROM HANGUL IN 15924
"Hang", // HANGUL
"Ethi", // ETHIOPIC
"Cher", // CHEROKEE
"Cans", // ABORIGINAL
"Ogam", // OGHAM
"Runr", // RUNIC
"Khmr", // KHMER
"Mong", // MONGOLIAN
"Hira", // HIRAGANA
"Kana", // KATAKANA
"Bopo", // BOPOMOFO
"Hani", // HAN
"Yiii", // YI
"Ital",
"Goth",
"Dsrt",
"Qaai",
};
static final String[] AGE = {
"UNSPECIFIED",
"1.1",
"2.0", "2.1",
"3.0", "3.1"
};
// Short General Category abbreviations. The position of each entry is the
// category code declared in UCD_Types (UNASSIGNED = 0 ... FINAL_PUNCTUATION = 30);
// the static initializer of this class reports an error when the length
// differs from LIMIT_CATEGORY. The number ending each comment is the entry's index.
static final String[] GC = {
"Cn", // = Other, Not Assigned 0
"Lu", // = Letter, Uppercase 1
"Ll", // = Letter, Lowercase 2
"Lt", // = Letter, Titlecase 3
"Lm", // = Letter, Modifier 4
"Lo", // = Letter, Other 5
"Mn", // = Mark, Non-Spacing 6
"Me", // = Mark, Enclosing 7 (UCD_Types.ENCLOSING_MARK)
"Mc", // = Mark, Spacing Combining 8 (UCD_Types.COMBINING_SPACING_MARK)
"Nd", // = Number, Decimal Digit 9
"Nl", // = Number, Letter 10
"No", // = Number, Other 11
"Zs", // = Separator, Space 12
"Zl", // = Separator, Line 13
"Zp", // = Separator, Paragraph 14
"Cc", // = Other, Control 15
"Cf", // = Other, Format 16
"<unused>", // placeholder for the unused code 17 (UCD_Types.UNUSED_CATEGORY)
"Co", // = Other, Private Use 18
"Cs", // = Other, Surrogate 19
"Pd", // = Punctuation, Dash 20
"Ps", // = Punctuation, Open 21
"Pe", // = Punctuation, Close 22
"Pc", // = Punctuation, Connector 23
"Po", // = Punctuation, Other 24
"Sm", // = Symbol, Math 25
"Sc", // = Symbol, Currency 26
"Sk", // = Symbol, Modifier 27
"So", // = Symbol, Other 28
"Pi", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
"Pf" // = Punctuation, Final quote 30 (may behave like Ps or Pe depending on usage)
};
// Long General Category names, in the same order as GC: the position of each
// entry is the category code declared in UCD_Types (UNASSIGNED = 0 ...
// FINAL_PUNCTUATION = 30). The number ending each comment is the entry's index.
static final String[] LONG_GC = {
"Unassigned", // = Other, Not Assigned 0
"UppercaseLetter", // = Letter, Uppercase 1
"LowercaseLetter", // = Letter, Lowercase 2
"TitlecaseLetter", // = Letter, Titlecase 3
"ModifierLetter", // = Letter, Modifier 4
"OtherLetter", // = Letter, Other 5
"NonspacingMark", // = Mark, Non-Spacing 6
"EnclosingMark", // = Mark, Enclosing 7 (UCD_Types.ENCLOSING_MARK)
"SpacingMark", // = Mark, Spacing Combining 8 (UCD_Types.COMBINING_SPACING_MARK)
"DecimalNumber", // = Number, Decimal Digit 9
"LetterNumber", // = Number, Letter 10
"OtherNumber", // = Number, Other 11
"SpaceSeparator", // = Separator, Space 12
"LineSeparator", // = Separator, Line 13
"ParagraphSeparator", // = Separator, Paragraph 14
"Control", // = Other, Control 15
"Format", // = Other, Format 16
"<unused>", // placeholder for the unused code 17 (UCD_Types.UNUSED_CATEGORY)
"PrivateUse", // = Other, Private Use 18
"Surrogate", // = Other, Surrogate 19
"DashPunctuation", // = Punctuation, Dash 20
"OpenPunctuation", // = Punctuation, Open 21
"ClosePunctuation", // = Punctuation, Close 22
"ConnectorPunctuation", // = Punctuation, Connector 23
"OtherPunctuation", // = Punctuation, Other 24
"MathSymbol", // = Symbol, Math 25
"CurrencySymbol", // = Symbol, Currency 26
"ModifierSymbol", // = Symbol, Modifier 27
"OtherSymbol", // = Symbol, Other 28
"InitialPunctuation", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
"FinalPunctuation" // = Punctuation, Final quote 30 (may behave like Ps or Pe depending on usage)
};
static String[] BC = {
"L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
"R", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
"EN", // European Number
"ES", // European Number Separator
"ET", // European Number Terminator
"AN", // Arabic Number
"CS", // Common Number Separator
"B", // Paragraph Separator
"S", // Segment Separator
"WS", // Whitespace
"ON", // Other Neutrals ; All other characters: punctuation, symbols
"<unused>", "BN", "NSM", "AL", "LRO", "RLO", "LRE", "RLE", "PDF"
};
static String[] LONG_BC = {
"LeftToRight", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
"RightToLeft", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
"EuropeanNumber", // European Number
"EuropeanSeparator", // European Number Separator
"EuropeanTerminator", // European Number Terminator
"ArabicNumber", // Arabic Number
"CommonSeparator", // Common Number Separator
"ParagraphSeparator", // Paragraph Separator
"SegmentSeparator", // Segment Separator
"WhiteSpace", // Whitespace
"OtherNeutral", // Other Neutrals ; All other characters: punctuation, symbols
"<unused>",
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
"LeftToRightOverride",
"RightToLeftOverride", "LeftToRightEmbedding",
"RightToLeftEmbedding", "PopDirectionalFormat"
};
private static String[] CASE_TABLE = {
"LOWER", "TITLE", "UPPER", "UNCASED"
};
static String[] DT = {
"", // NONE
"canonical", // CANONICAL
"compat", // Otherwise unspecified compatibility character.
"font", // A font variant (e.g. a blackletter form).
"noBreak", // A no-break version of a space or hyphen.
"initial", // // An initial presentation form (Arabic).
"medial", // // A medial presentation form (Arabic).
"final", // // A final presentation form (Arabic).
"isolated", // An isolated presentation form (Arabic).
"circle", // An encircled form.
"super", // A superscript form.
"sub", // A subscript form.
"vertical", // A vertical layout presentation form.
"wide", // A wide (or zenkaku) compatibility character.
"narrow", // A narrow (or hankaku) compatibility character.
"small", // A small variant form (CNS compatibility).
"square", // A CJK squared font variant.
"fraction", // A vulgar fraction form.
};
static String[] SHORT_DT = {
"", // NONE
"ca", // CANONICAL
"co", // Otherwise unspecified compatibility character.
"fo", // A font variant (e.g. a blackletter form).
"nb", // A no-break version of a space or hyphen.
"in", // // An initial presentation form (Arabic).
"me", // // A medial presentation form (Arabic).
"fi", // // A final presentation form (Arabic).
"is", // An isolated presentation form (Arabic).
"ci", // An encircled form.
"sp", // A superscript form.
"sb", // A subscript form.
"ve", // A vertical layout presentation form.
"wi", // A wide (or zenkaku) compatibility character.
"na", // A narrow (or hankaku) compatibility character.
"sm", // A small variant form (CNS compatibility).
"sq", // A CJK squared font variant.
"fr", // A vulgar fraction form.
};
static private String[] MIRRORED_TABLE = {
"N",
"Y"
};
static String[] NT = {
"",
"numeric",
"digit",
"decimal",
};
static String[] SHORT_NT = {
"",
"nu",
"di",
"de",
};
// Class-load sanity check: every name table above must have exactly as many
// entries as the matching LIMIT_* constant, because the tables are indexed by
// those codes. A mismatch is only reported on System.err; loading continues.
static {
    if (LIMIT_CATEGORY != GC.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: category");
    }
    if (LIMIT_BIDI_CLASS != BC.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: bidi");
    }
    if (LIMIT_LINE_BREAK != LB.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: linebreak");
    }
    if (LIMIT_DECOMPOSITION_TYPE != DT.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: compat type");
    }
    // This check used to appear twice and to report itself as "compat type".
    if (MIRRORED_LIMIT != MIRRORED_TABLE.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: mirrored");
    }
    if (CASE_LIMIT != CASE_TABLE.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: case");
    }
    // This check used to report itself as "case".
    if (LIMIT_NUMERIC_TYPE != NT.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: numeric type");
    }
    if (LIMIT_EAST_ASIAN_WIDTH != EA.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: east Asian Width");
    }
    if (LIMIT_BINARY_PROPERTIES != BP.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: binary properties");
    }
    if (LIMIT_SCRIPT != SCRIPT.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: script");
    }
    if (LIMIT_AGE != AGE.length) {
        System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: age");
    }
}
// Result of looking up "ON" in the BC table via Utility.lookup at class-load time.
// NOTE(review): presumably equal to UCD_Types.BIDI_ON (10), since "ON" is the
// eleventh entry of BC -- confirm that Utility.lookup returns the array index.
// NOTE(review): the field is public and not final, so it can be reassigned.
public static byte ON = Utility.lookup("ON", BC);
public static String[] JOINING_TYPE = {
"C",
"D",
"R",
"U",
"L",
"T"
};
public static String[] LONG_JOINING_TYPE = {
"JoinCausing",
"DualJoining",
"RightJoining",
"NonJoining",
"LeftJoining",
"Transparent"
};
public static String[] JOINING_GROUP = {
"NO_JOINING_GROUP",
"AIN",
"ALAPH",
"ALEF",
"BEH",
"BETH",
"DAL",
"DALATH_RISH",
"E",
"FEH",
"FINAL_SEMKATH",
"GAF",
"GAMAL",
"HAH",
"HAMZA_ON_HEH_GOAL",
"HE",
"HEH",
"HEH_GOAL",
"HETH",
"KAF",
"KAPH",
"KNOTTED_HEH",
"LAM",
"LAMADH",
"MEEM",
"MIM",
"NOON",
"NUN",
"PE",
"QAF",
"QAPH",
"REH",
"REVERSED_PE",
"SAD",
"SADHE",
"SEEN",
"SEMKATH",
"SHIN",
"SWASH_KAF",
"TAH",
"TAW",
"TEH_MARBUTA",
"TETH",
"WAW",
"YEH",
"YEH_BARREE",
"YEH_WITH_TAIL",
"YUDH",
"YUDH_HE",
"ZAIN",
};
public static String[] OLD_JOINING_GROUP = {
"<no shaping>",
"AIN",
"ALAPH",
"ALEF",
"BEH",
"BETH",
"DAL",
"DALATH RISH",
"E",
"FEH",
"FINAL SEMKATH",
"GAF",
"GAMAL",
"HAH",
"HAMZA ON HEH GOAL",
"HE",
"HEH",
"HEH GOAL",
"HETH",
"KAF",
"KAPH",
"KNOTTED HEH",
"LAM",
"LAMADH",
"MEEM",
"MIM",
"NOON",
"NUN",
"PE",
"QAF",
"QAPH",
"REH",
"REVERSED PE",
"SAD",
"SADHE",
"SEEN",
"SEMKATH",
"SHIN",
"SWASH KAF",
"TAH",
"TAW",
"TEH MARBUTA",
"TETH",
"WAW",
"YEH",
"YEH BARREE",
"YEH WITH TAIL",
"YUDH",
"YUDH HE",
"ZAIN",
};
// Short names of the 19 leading consonants (choseong) U+1100..U+1112, in
// code point order; entry i belongs to U+1100 + i.
static String[] JAMO_L_TABLE = {
// Value; Short Name; Unicode Name
"G", // U+1100; G; HANGUL CHOSEONG KIYEOK
"GG", // U+1101; GG; HANGUL CHOSEONG SSANGKIYEOK
"N", // U+1102; N; HANGUL CHOSEONG NIEUN
"D", // U+1103; D; HANGUL CHOSEONG TIKEUT
"DD", // U+1104; DD; HANGUL CHOSEONG SSANGTIKEUT
"R", // U+1105; R; HANGUL CHOSEONG RIEUL -- NOTE(review): this comment used to list the short name as "L" while the table value is "R"; confirm against Jamo.txt
"M", // U+1106; M; HANGUL CHOSEONG MIEUM
"B", // U+1107; B; HANGUL CHOSEONG PIEUP
"BB", // U+1108; BB; HANGUL CHOSEONG SSANGPIEUP
"S", // U+1109; S; HANGUL CHOSEONG SIOS
"SS", // U+110A; SS; HANGUL CHOSEONG SSANGSIOS
"", // U+110B; ; HANGUL CHOSEONG IEUNG
"J", // U+110C; J; HANGUL CHOSEONG CIEUC
"JJ", // U+110D; JJ; HANGUL CHOSEONG SSANGCIEUC
"C", // U+110E; C; HANGUL CHOSEONG CHIEUCH
"K", // U+110F; K; HANGUL CHOSEONG KHIEUKH
"T", // U+1110; T; HANGUL CHOSEONG THIEUTH
"P", // U+1111; P; HANGUL CHOSEONG PHIEUPH
"H" // U+1112; H; HANGUL CHOSEONG HIEUH
};
static String[] JAMO_V_TABLE = {
// Value; Short Name; Unicode Name
"A", // U+1161; A; HANGUL JUNGSEONG A
"AE", // U+1162; AE; HANGUL JUNGSEONG AE
"YA", // U+1163; YA; HANGUL JUNGSEONG YA
"YAE", // U+1164; YAE; HANGUL JUNGSEONG YAE
"EO", // U+1165; EO; HANGUL JUNGSEONG EO
"E", // U+1166; E; HANGUL JUNGSEONG E
"YEO", // U+1167; YEO; HANGUL JUNGSEONG YEO
"YE", // U+1168; YE; HANGUL JUNGSEONG YE
"O", // U+1169; O; HANGUL JUNGSEONG O
"WA", // U+116A; WA; HANGUL JUNGSEONG WA
"WAE", // U+116B; WAE; HANGUL JUNGSEONG WAE
"OE", // U+116C; OE; HANGUL JUNGSEONG OE
"YO", // U+116D; YO; HANGUL JUNGSEONG YO
"U", // U+116E; U; HANGUL JUNGSEONG U
"WEO", // U+116F; WEO; HANGUL JUNGSEONG WEO
"WE", // U+1170; WE; HANGUL JUNGSEONG WE
"WI", // U+1171; WI; HANGUL JUNGSEONG WI
"YU", // U+1172; YU; HANGUL JUNGSEONG YU
"EU", // U+1173; EU; HANGUL JUNGSEONG EU
"YI", // U+1174; YI; HANGUL JUNGSEONG YI
"I", // U+1175; I; HANGUL JUNGSEONG I
};
static String[] JAMO_T_TABLE = {
// Value; Short Name; Unicode Name
"", // filler, for LV syllable
"G", // U+11A8; G; HANGUL JONGSEONG KIYEOK
"GG", // U+11A9; GG; HANGUL JONGSEONG SSANGKIYEOK
"GS", // U+11AA; GS; HANGUL JONGSEONG KIYEOK-SIOS
"N", // U+11AB; N; HANGUL JONGSEONG NIEUN
"NJ", // U+11AC; NJ; HANGUL JONGSEONG NIEUN-CIEUC
"NH", // U+11AD; NH; HANGUL JONGSEONG NIEUN-HIEUH
"D", // U+11AE; D; HANGUL JONGSEONG TIKEUT
"L", // U+11AF; L; HANGUL JONGSEONG RIEUL
"LG", // U+11B0; LG; HANGUL JONGSEONG RIEUL-KIYEOK
"LM", // U+11B1; LM; HANGUL JONGSEONG RIEUL-MIEUM
"LB", // U+11B2; LB; HANGUL JONGSEONG RIEUL-PIEUP
"LS", // U+11B3; LS; HANGUL JONGSEONG RIEUL-SIOS
"LT", // U+11B4; LT; HANGUL JONGSEONG RIEUL-THIEUTH
"LP", // U+11B5; LP; HANGUL JONGSEONG RIEUL-PHIEUPH
"LH", // U+11B6; LH; HANGUL JONGSEONG RIEUL-HIEUH
"M", // U+11B7; M; HANGUL JONGSEONG MIEUM
"B", // U+11B8; B; HANGUL JONGSEONG PIEUP
"BS", // U+11B9; BS; HANGUL JONGSEONG PIEUP-SIOS
"S", // U+11BA; S; HANGUL JONGSEONG SIOS
"SS", // U+11BB; SS; HANGUL JONGSEONG SSANGSIOS
"NG", // U+11BC; NG; HANGUL JONGSEONG IEUNG
"J", // U+11BD; J; HANGUL JONGSEONG CIEUC
"C", // U+11BE; C; HANGUL JONGSEONG CHIEUCH
"K", // U+11BF; K; HANGUL JONGSEONG KHIEUKH
"T", // U+11C0; T; HANGUL JONGSEONG THIEUTH
"P", // U+11C1; P; HANGUL JONGSEONG PHIEUPH
"H", // U+11C2; H; HANGUL JONGSEONG HIEUH
};
/*
static {
UNASSIGNED_INFO.code = '\uFFFF';
UNASSIGNED_INFO.name = "<reserved>";
UNASSIGNED_INFO.decomposition = "";
UNASSIGNED_INFO.fullCanonicalDecomposition = "";
UNASSIGNED_INFO.fullCompatibilityDecomposition = "";
UNASSIGNED_INFO.name10 = "";
UNASSIGNED_INFO.comment = "";
UNASSIGNED_INFO.numericType = NONE;
UNASSIGNED_INFO.decompositionType = NONE;
UNASSIGNED_INFO.category = lookup("Cn",CATEGORY_TABLE, "PROXY");
UNASSIGNED_INFO.canonical = 0;
UNASSIGNED_INFO.uppercase = "";
UNASSIGNED_INFO.lowercase = "";
UNASSIGNED_INFO.titlecase = "";
UNASSIGNED_INFO.bidi = ON;
UNASSIGNED_INFO.mirrored = NO;
}
*/
}

View file

@ -0,0 +1,374 @@
package com.ibm.text.UCD;
public interface UCD_Types {
/** Root directory of the data files; the value ends with a path separator. */
public static final String DATA_DIR = "C:\\DATA\\";
/**
 * The "BIN" sub-directory of {@link #DATA_DIR}. DATA_DIR already ends with a
 * separator, so none is prepended here (the old value contained "\\\\").
 */
public static final String BIN_DIR = DATA_DIR + "BIN\\";
/** The "GEN" sub-directory of {@link #DATA_DIR}; built the same way as BIN_DIR. */
public static final String GEN_DIR = DATA_DIR + "GEN\\";
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
/*
0 Code value in 4-digit hexadecimal format.
1 Unicode 2.1 Character Name. These names match exactly the
2 General Category. This is a useful breakdown into various "character
3 Canonical Combining Classes. The classes used for the
4 Bidirectional Category. See the list below for an explanation of the
5 Character Decomposition. In the Unicode Standard, not all of
6 Decimal digit value. This is a numeric field. If the character
7 Digit value. This is a numeric field. If the character represents a
8 Numeric value. This is a numeric field. If the character has the
9 If the characters has been identified as a "mirrored" character in
10 Unicode 1.0 Name. This is the old name as published in Unicode 1.0.
11 10646 Comment field. This field is informative.
12 Upper case equivalent mapping. If a character is part of an
13 Lower case equivalent mapping. Similar to 12. This field is informative.
14 Title case equivalent mapping. Similar to 12. This field is informative.
*/
// Binary ENUM Grouping
public static final int
CATEGORY = 0,
COMBINING_CLASS = 0x100,
BIDI_CLASS = 0x200,
DECOMPOSITION_TYPE = 0x300,
NUMERIC_TYPE = 0x400,
EAST_ASIAN_WIDTH = 0x500,
LINE_BREAK = 0x600,
JOINING_TYPE = 0x700,
JOINING_GROUP = 0x800,
BINARY_PROPERTIES = 0x900,
SCRIPT = 0xA00,
AGE = 0xB00,
NEXT_ENUM = 0x100,
LIMIT_ENUM = AGE + 0x100;
public static final int LIMIT_COMBINING_CLASS = 256;
// getCategory
public static final byte
UNASSIGNED = 0,
UPPERCASE_LETTER = 1,
LOWERCASE_LETTER = 2,
TITLECASE_LETTER = 3,
MODIFIER_LETTER = 4,
OTHER_LETTER = 5,
NON_SPACING_MARK = 6,
ENCLOSING_MARK = 7,
COMBINING_SPACING_MARK = 8,
DECIMAL_DIGIT_NUMBER = 9,
LETTER_NUMBER = 10,
OTHER_NUMBER = 11,
SPACE_SEPARATOR = 12,
LINE_SEPARATOR = 13,
PARAGRAPH_SEPARATOR = 14,
CONTROL = 15,
FORMAT = 16,
UNUSED_CATEGORY = 17,
PRIVATE_USE = 18,
SURROGATE = 19,
DASH_PUNCTUATION = 20,
START_PUNCTUATION = 21,
END_PUNCTUATION = 22,
CONNECTOR_PUNCTUATION = 23,
OTHER_PUNCTUATION = 24,
MATH_SYMBOL = 25,
CURRENCY_SYMBOL = 26,
MODIFIER_SYMBOL = 27,
OTHER_SYMBOL = 28,
INITIAL_PUNCTUATION = 29,
FINAL_PUNCTUATION = 30,
LIMIT_CATEGORY = FINAL_PUNCTUATION+1,
// Unicode abbreviations
Lu = UPPERCASE_LETTER,
Ll = LOWERCASE_LETTER,
Lt = TITLECASE_LETTER,
Lm = MODIFIER_LETTER,
Lo = OTHER_LETTER,
Mn = NON_SPACING_MARK,
Me = ENCLOSING_MARK,
Mc = COMBINING_SPACING_MARK,
Nd = DECIMAL_DIGIT_NUMBER,
Nl = LETTER_NUMBER,
No = OTHER_NUMBER,
Zs = SPACE_SEPARATOR,
Zl = LINE_SEPARATOR,
Zp = PARAGRAPH_SEPARATOR,
Cc = CONTROL,
Cf = FORMAT,
Cs = SURROGATE,
Co = PRIVATE_USE,
Cn = UNASSIGNED,
Pc = CONNECTOR_PUNCTUATION,
Pd = DASH_PUNCTUATION,
Ps = START_PUNCTUATION,
Pe = END_PUNCTUATION,
Po = OTHER_PUNCTUATION,
Pi = INITIAL_PUNCTUATION,
Pf = FINAL_PUNCTUATION,
Sm = MATH_SYMBOL,
Sc = CURRENCY_SYMBOL,
Sk = MODIFIER_SYMBOL,
So = OTHER_SYMBOL;
static final int
LETTER_MASK = (1<<Lu) | (1<<Ll) | (1<<Lt) | (1<<Lm) | (1 << Lo),
MARK_MASK = (1<<Mn) | (1<<Me) | (1<<Mc),
NUMBER_MASK = (1<<Nd) | (1<<Nl) | (1<<No),
SEPARATOR_MASK = (1<<Zs) | (1<<Zl) | (1<<Zp),
CONTROL_MASK = (1<<Cc) | (1<<Cf) | (1<<Cs) | (1<<Co),
PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
UNASSIGNED_MASK = (1<<Cn);
// Binary Properties
public static final byte
BidiMirrored = 0,
CompositionExclusion = 1,
White_space = 2,
Non_break = 3,
Bidi_Control = 4,
Join_Control = 5,
Dash = 6,
Hyphen = 7,
Quotation_Mark = 8,
Terminal_Punctuation = 9,
Math_Property = 10,
Hex_Digit = 11,
ASCII_Hex_Digit = 12,
Alphabetic = 13,
Ideographic = 14,
Diacritic = 15,
Extender = 16,
Other_Lowercase = 17,
Other_Uppercase = 18,
Noncharacter_Code_Point = 19,
CaseFoldTurkishI = 20,
Other_GraphemeExtend = 21,
GraphemeLink = 22,
IDS_BinaryOperator = 23,
IDS_TrinaryOperator = 24,
Radical = 25,
UnifiedIdeograph = 26,
Reserved_Cf_Code_Point = 27,
Deprecated = 28,
LIMIT_BINARY_PROPERTIES = 29;
/*
static final int
BidiMirroredMask = 1<<BidiMirrored,
CompositionExclusionMask = 1<<CompositionExclusion,
AlphabeticMask = 1<<Alphabetic,
Bidi_ControlMask = 1<<Bidi_Control,
DashMask = 1<<Dash,
DiacriticMask = 1<<Diacritic,
ExtenderMask = 1<<Extender,
Hex_DigitMask = 1<<Hex_Digit,
HyphenMask = 1<<Hyphen,
IdeographicMask = 1<<Ideographic,
Join_ControlMask = 1<<Join_Control,
Math_PropertyMask = 1<<Math_Property,
Non_breakMask = 1<<Non_break,
Noncharacter_Code_PointMask = 1<<Noncharacter_Code_Point,
Other_LowercaseMask = 1<<Other_Lowercase,
Other_UppercaseMask = 1<<Other_Uppercase,
Quotation_MarkMask = 1<<Quotation_Mark,
Terminal_PunctuationMask = 1<<Terminal_Punctuation,
White_spaceMask = 1<<White_space;
*/
// line break
public static final byte
LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
// east asian width
public static final byte
EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
LIMIT_EAST_ASIAN_WIDTH = 6;
// Bidi class codes. Each value is the index of the matching entry in the
// UCD_Names.BC / LONG_BC name tables; LIMIT_BIDI_CLASS is the table size.
// Code 11 has no class of its own: it is both LIMIT_BIDI_2 (end of the first
// group, 0..10) and BIDI_UNUSED, and the name tables hold "<unused>" there.
static final byte
BIDI_L = 0, // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
BIDI_R = 1, // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
BIDI_EN = 2, // European Number
BIDI_ES = 3, // European Number Separator
BIDI_ET = 4, // European Number Terminator
BIDI_AN = 5, // Arabic Number
BIDI_CS = 6, // Common Number Separator
BIDI_B = 7, // Paragraph Separator (as named in UCD_Names.BC / LONG_BC)
BIDI_S = 8, // Segment Separator
BIDI_WS = 9, // Whitespace
BIDI_ON = 10, // Other Neutrals ; All other characters: punctuation, symbols
LIMIT_BIDI_2 = 11,
BIDI_UNUSED = 11,
BIDI_BN = 12,
BIDI_NSM = 13,
BIDI_AL = 14,
BIDI_LRO = 15,
BIDI_RLO = 16,
BIDI_LRE = 17,
BIDI_RLE = 18,
BIDI_PDF = 19,
LIMIT_BIDI_CLASS = 20;
// decompositionType
static final byte NONE = 0,
CANONICAL = 1,
COMPATIBILITY = 2,
COMPAT_UNSPECIFIED = 2, // Otherwise unspecified compatibility character.
COMPAT_FONT = 3, // A font variant (e.g. a blackletter form).
COMPAT_NOBREAK = 4, // A no-break version of a space or hyphen.
COMPAT_INITIAL = 5, // // An initial presentation form (Arabic).
COMPAT_MEDIAL = 6, // // A medial presentation form (Arabic).
COMPAT_FINAL = 7, // // A final presentation form (Arabic).
COMPAT_ISOLATED = 8, // An isolated presentation form (Arabic).
COMPAT_CIRCLE = 9, // An encircled form.
COMPAT_SUPER = 10, // A superscript form.
COMPAT_SUB = 11, // A subscript form.
COMPAT_VERTICAL = 12, // A vertical layout presentation form.
COMPAT_WIDE = 13, // A wide (or zenkaku) compatibility character.
COMPAT_NARROW = 14, // A narrow (or hankaku) compatibility character.
COMPAT_SMALL = 15, // A small variant form (CNS compatibility).
COMPAT_SQUARE = 16, // A CJK squared font variant.
COMPAT_FRACTION = 17, // A vulgar fraction form.
LIMIT_DECOMPOSITION_TYPE = 18;
// mirrored type
static final byte NO = 0, YES = 1, MIRRORED_LIMIT = 2;
// for QuickCheck
static final byte QNO = 0, QMAYBE = 1, QYES = 2;
// case type
static final byte LOWER = 0, TITLE = 1, UPPER = 2, UNCASED = 3, FOLD = 3, CASE_LIMIT = 4;
static final byte SIMPLE = 0, FULL = 8;
// normalization type
static final byte UNNORMALIZED = 0, C = 1, KC = 2, D = 3, KD = 4, FORM_LIMIT = 5;
// numericType
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
LIMIT_NUMERIC_TYPE = 4;
public static final byte // SCRIPT CODE
COMMON_SCRIPT = 0,
LATIN_SCRIPT = 1,
GREEK_SCRIPT = 2,
CYRILLIC_SCRIPT = 3,
ARMENIAN_SCRIPT = 4,
HEBREW_SCRIPT = 5,
ARABIC_SCRIPT = 6,
SYRIAC_SCRIPT = 7,
THAANA_SCRIPT = 8,
DEVANAGARI_SCRIPT = 9,
BENGALI_SCRIPT = 10,
GURMUKHI_SCRIPT = 11,
GUJARATI_SCRIPT = 12,
ORIYA_SCRIPT = 13,
TAMIL_SCRIPT = 14,
TELUGU_SCRIPT = 15,
KANNADA_SCRIPT = 16,
MALAYALAM_SCRIPT = 17,
SINHALA_SCRIPT = 18,
THAI_SCRIPT = 19,
LAO_SCRIPT = 20,
TIBETAN_SCRIPT = 21,
MYANMAR_SCRIPT = 22,
GEORGIAN_SCRIPT = 23,
UNUSED_SCRIPT = 24,
HANGUL_SCRIPT = 25,
ETHIOPIC_SCRIPT = 26,
CHEROKEE_SCRIPT = 27,
ABORIGINAL_SCRIPT = 28,
OGHAM_SCRIPT = 29,
RUNIC_SCRIPT = 30,
KHMER_SCRIPT = 31,
MONGOLIAN_SCRIPT = 32,
HIRAGANA_SCRIPT = 33,
KATAKANA_SCRIPT = 34,
BOPOMOFO_SCRIPT = 35,
HAN_SCRIPT = 36,
YI_SCRIPT = 37,
OLD_ITALIC_SCRIPT = 38,
GOTHIC_SCRIPT = 39,
DESERET_SCRIPT = 40,
INHERITED_SCRIPT = 41,
LIMIT_SCRIPT = 42;
static final int
UNKNOWN = 0,
AGE10 = 1,
AGE20 = 2,
AGE21 = 3,
AGE30 = 4,
AGE31 = 5,
LIMIT_AGE = 6;
public static byte
JT_C = 0,
JT_D = 1,
JT_R = 2,
JT_U = 3,
JT_L = 4,
JT_T = 5,
LIMIT_JOINING_TYPE = 6;
public static byte
NO_SHAPING = 0,
AIN = 1,
ALAPH = 2,
ALEF = 3,
BEH = 4,
BETH = 5,
DAL = 6,
DALATH_RISH = 7,
E = 8,
FEH = 9,
FINAL_SEMKATH = 10,
GAF = 11,
GAMAL = 12,
HAH = 13,
HAMZA_ON_HEH_GOAL = 14,
HE = 15,
HEH = 16,
HEH_GOAL = 17,
HETH = 18,
KAF = 19,
KAPH = 20,
KNOTTED_HEH = 21,
LAM = 22,
LAMADH = 23,
MEEM = 24,
MIM = 25,
NOON = 26,
NUN = 27,
PE = 28,
QAF = 29,
QAPH = 30,
REH = 31,
REVERSED_PE = 32,
SAD = 33,
SADHE = 34,
SEEN = 35,
SEMKATH = 36,
SHIN = 37,
SWASH_KAF = 38,
TAH = 39,
TAW = 40,
TEH_MARBUTA = 41,
TETH = 42,
WAW = 43,
YEH = 44,
YEH_BARREE = 45,
YEH_WITH_TAIL = 46,
YUDH = 47,
YUDH_HE = 48,
ZAIN = 49,
LIMIT_JOINING_GROUP = 50;
}

View file

@ -0,0 +1,317 @@
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
class UData implements UCD_Types {
String name;
String decompositionMapping;
String simpleUppercase;
String simpleLowercase;
String simpleTitlecase;
String simpleCaseFolding;
String fullUppercase;
String fullLowercase;
String fullTitlecase;
String fullCaseFolding;
String specialCasing = "";
String bidiMirror;
int codePoint = -1;
float numericValue = Float.NaN;
int binaryProperties; // bidiMirroring, compositionExclusions, PropList
byte generalCategory = Cn;
byte combiningClass = 0;
byte bidiClass = BIDI_ON;
byte decompositionType = NONE;
byte numericType = NUMERIC_NONE;
byte eastAsianWidth = EAN;
byte lineBreak = LBXX;
byte joiningType = JT_U;
byte joiningGroup = NO_SHAPING;
byte script = COMMON_SCRIPT;
byte age = 0;
static final UData UNASSIGNED = new UData();
//static final UData NONCHARACTER = new UData();
static {
UNASSIGNED.name = "<unassigned>";
UNASSIGNED.decompositionMapping = UNASSIGNED.bidiMirror
= UNASSIGNED.simpleUppercase
= UNASSIGNED.simpleLowercase
= UNASSIGNED.simpleTitlecase = "";
UNASSIGNED.fleshOut();
/*NONCHARACTER.name = "<noncharacter>";
NONCHARACTER.decompositionMapping = NONCHARACTER.bidiMirror
= NONCHARACTER.simpleUppercase
= NONCHARACTER.simpleLowercase
= NONCHARACTER.simpleTitlecase = "";
NONCHARACTER.binaryProperties = Noncharacter_Code_PointMask;
NONCHARACTER.fleshOut();
*/
}
public UData (int codePoint) {
this.codePoint = codePoint;
}
public UData () {
}
/**
 * Compares every property of this record with another {@code UData}.
 *
 * <p>Two records are equal when all string, numeric and byte properties match.
 * String properties may be {@code null} (for example after {@link #compact()});
 * two nulls match, a null never matches a non-null value. Two NaN numeric
 * values match each other, so a record whose {@code numericValue} is still the
 * default {@code Float.NaN} equals itself.
 *
 * @param that the object to compare with; may be {@code null} or of another type
 * @return {@code true} only if {@code that} is a {@code UData} with identical properties
 */
public boolean equals(Object that) {
    if (this == that) return true;
    if (!(that instanceof UData)) return false; // also rejects null
    UData other = (UData) that;

    // Cheap primitive comparisons first.
    if (codePoint != other.codePoint) return false;
    if (!sameFloat(numericValue, other.numericValue)) return false;
    if (binaryProperties != other.binaryProperties) return false;
    if (generalCategory != other.generalCategory) return false;
    if (combiningClass != other.combiningClass) return false;
    if (bidiClass != other.bidiClass) return false;
    if (decompositionType != other.decompositionType) return false;
    if (numericType != other.numericType) return false;
    if (eastAsianWidth != other.eastAsianWidth) return false;
    if (lineBreak != other.lineBreak) return false;
    if (joiningType != other.joiningType) return false;
    if (joiningGroup != other.joiningGroup) return false;
    if (script != other.script) return false;
    if (age != other.age) return false;

    return sameString(name, other.name)
        && sameString(decompositionMapping, other.decompositionMapping)
        && sameString(simpleUppercase, other.simpleUppercase)
        && sameString(simpleLowercase, other.simpleLowercase)
        && sameString(simpleTitlecase, other.simpleTitlecase)
        && sameString(simpleCaseFolding, other.simpleCaseFolding)
        && sameString(fullUppercase, other.fullUppercase)
        && sameString(fullLowercase, other.fullLowercase)
        && sameString(fullTitlecase, other.fullTitlecase)
        && sameString(fullCaseFolding, other.fullCaseFolding)
        && sameString(specialCasing, other.specialCasing)
        && sameString(bidiMirror, other.bidiMirror);
}

/**
 * Hash code consistent with {@link #equals(Object)}: equal records always
 * have the same code point.
 */
public int hashCode() {
    return codePoint;
}

/** Null-safe string equality: two nulls are equal, null never equals non-null. */
private static boolean sameString(String a, String b) {
    return a == null ? b == null : a.equals(b);
}

/** Float equality that additionally treats NaN as equal to NaN. */
private static boolean sameFloat(float a, float b) {
    return a == b || (Float.isNaN(a) && Float.isNaN(b));
}
/**
 * Replaces every null mapping field with its default value.
 * Fields that are already non-null are left untouched, so calling this
 * repeatedly has no further effect. The statement order matters: a field is
 * defaulted only after the field it falls back to has been filled in.
 */
public void fleshOut() {
// The code point as a string: the default for the "base" mappings below.
String codeValue = UTF32.valueOf32(codePoint);
if (decompositionMapping == null) decompositionMapping = codeValue;
if (bidiMirror == null) bidiMirror = codeValue;
// Lowercase chain: simple lowercase -> simple folding / full lowercase -> full folding.
if (simpleLowercase == null) simpleLowercase = codeValue;
if (simpleCaseFolding == null) simpleCaseFolding = simpleLowercase;
if (fullLowercase == null) fullLowercase = simpleLowercase;
if (fullCaseFolding == null) fullCaseFolding = fullLowercase;
// Upper/title case: the simple forms default to the code point, the full forms to the simple forms.
if (simpleUppercase == null) simpleUppercase = codeValue;
if (simpleTitlecase == null) simpleTitlecase = codeValue;
if (fullUppercase == null) fullUppercase = simpleUppercase;
if (fullTitlecase == null) fullTitlecase = simpleTitlecase;
}
/**
 * Inverse of fleshOut(): sets to null every mapping field whose value equals
 * the default that fleshOut() would assign. writeBytes() calls this before
 * serializing, and writeString() stores a null as a single zero byte.
 * fleshOut() is called first so that every field is non-null for the
 * comparisons. The statement order matters: each derived field is compared
 * with its fallback field before that fallback field is itself set to null.
 */
public void compact() {
fleshOut();
String codeValue = UTF32.valueOf32(codePoint);
// Full forms are tested against the simple forms while those are still non-null.
if (fullTitlecase.equals(simpleTitlecase)) fullTitlecase = null;
if (fullUppercase.equals(simpleUppercase)) fullUppercase = null;
if (simpleTitlecase.equals(codeValue)) simpleTitlecase = null;
if (simpleUppercase.equals(codeValue)) simpleUppercase = null;
// Lowercase chain, from the most derived field back to simpleLowercase.
if (fullCaseFolding.equals(fullLowercase)) fullCaseFolding = null;
if (fullLowercase.equals(simpleLowercase)) fullLowercase = null;
if (simpleCaseFolding.equals(simpleLowercase)) simpleCaseFolding = null;
if (simpleLowercase.equals(codeValue)) simpleLowercase = null;
if (decompositionMapping.equals(codeValue)) decompositionMapping = null;
if (bidiMirror.equals(codeValue)) bidiMirror = null;
}
/**
 * Replaces the whole binary-property bit set of this record.
 *
 * @param binaryProperties the new bit set; toString(byte) tests bit i against
 *        UCD_Names.BP[i] for i below LIMIT_BINARY_PROPERTIES
 */
public void setBinaryProperties(int binaryProperties) {
this.binaryProperties = binaryProperties;
}
/**
 * Tells whether the general category of this record is one of the letter
 * categories collected in {@code UCD_Types.LETTER_MASK}.
 *
 * @return {@code true} if the category's bit is set in the letter mask
 */
public boolean isLetter() {
    int categoryBit = 1 << generalCategory;
    boolean inLetterMask = (categoryBit & UCD_Types.LETTER_MASK) != 0;
    return inLetterMask;
}
/**
 * Writes an optional string: a marker byte (0 for {@code null}, 1 otherwise)
 * followed, for non-null values, by the string in {@code writeUTF} format.
 *
 * @param os the stream to write to
 * @param s  the string to write; may be {@code null}
 * @throws IOException if the underlying stream fails
 */
public static void writeString(DataOutputStream os, String s) throws IOException {
    boolean present = s != null;
    os.writeByte(present ? 1 : 0);
    if (present) {
        os.writeUTF(s);
    }
}
static final byte[] byteBuffer = new byte[256];
/**
 * Reads an optional string in the format produced by {@code writeString}:
 * one unsigned marker byte, then (for any non-zero marker) a
 * {@code readUTF}-encoded string.
 *
 * @param is the stream to read from
 * @return {@code null} when the marker byte is 0, otherwise the decoded string
 * @throws IOException if the underlying stream fails or ends early
 */
public static String readString(DataInputStream is) throws IOException {
    int marker = is.readUnsignedByte();
    return (marker != 0) ? is.readUTF() : null;
}
// Output styles accepted by toString(byte).
// NOTE: inside this class FULL (= 1) hides the FULL (= 8) constant that is
// inherited from UCD_Types.
static final byte ABBREVIATED = 0, FULL = 1;
/** Returns the rendering produced by toString(FULL), i.e. with every attribute written. */
public String toString() {
return toString(FULL);
}
/**
 * Renders this record as one XML-style {@code <e .../>} element.
 *
 * <p>With style {@code FULL} (this class's constant, value 1) every attribute
 * is written. With any other style an attribute is written only when its value
 * differs from the value it is tested against below. Attribute groups are
 * separated by "\r\n". The mapping fields must be non-null, i.e. the record
 * must not be in the state left by {@link #compact()}.
 *
 * @param style {@code FULL} to write every attribute; any other value
 *              (such as {@code ABBREVIATED}) to omit the values tested below
 * @return the element text
 */
public String toString(byte style) {
    boolean full = style == FULL;
    StringBuffer result = new StringBuffer();
    // The code point as a string; the same value fleshOut() uses as the default mapping.
    String s = UTF32.valueOf32(codePoint);

    // Identification: always written, except the script name in abbreviated mode.
    result.append("<e c='").append(Utility.quoteXML(codePoint)).append('\'');
    result.append(" hx='").append(Utility.hex(codePoint)).append('\'');
    if (full || script != COMMON_SCRIPT) result.append(" sn='").append(UCD_Names.SCRIPT[script]).append('\'');
    result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");

    // Enumerated and numeric properties.
    int lastPos = result.length();
    if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
    // combiningClass is stored in a signed byte; mask it to print 0..255.
    if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
    if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
    if (full || !s.equals(decompositionMapping)) result.append(" dm='").append(Utility.quoteXML(decompositionMapping)).append('\'');
    if (full || numericType != NUMERIC_NONE) result.append(" nt='").append(UCD_Names.NT[numericType]).append('\'');
    if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
    if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
    if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
    if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
    if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
    if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
    if (full || bidiClass != BIDI_L) result.append(" bc='").append(UCD_Names.BC[bidiClass]).append('\'');
    if (full || !bidiMirror.equals(s)) result.append(" bmg='").append(Utility.quoteXML(bidiMirror)).append('\'');
    // Start a new line only if this group wrote something.
    if (lastPos != result.length()) {
        result.append("\r\n");
        lastPos = result.length();
    }

    // Binary properties: one NAME='T' entry per set bit.
    int bprops = binaryProperties;
    for (int i = 0; i < LIMIT_BINARY_PROPERTIES; ++i) {
        if ((bprops & (1<<i)) != 0) result.append(UCD_Names.BP[i]).append("='T' ");
    }
    if (lastPos != result.length()) {
        result.append("\r\n");
        lastPos = result.length();
    }

    // Case mappings.
    if (full || !fullLowercase.equals(s)) result.append(" lc='").append(Utility.quoteXML(fullLowercase)).append('\'');
    if (full || !fullUppercase.equals(simpleUppercase)) result.append(" uc='").append(Utility.quoteXML(fullUppercase)).append('\'');
    if (full || !fullTitlecase.equals(fullUppercase)) result.append(" tc='").append(Utility.quoteXML(fullTitlecase)).append('\'');
    if (full || !fullCaseFolding.equals(fullLowercase)) result.append(" cf='").append(Utility.quoteXML(fullCaseFolding)).append('\'');

    // The simple lower/upper mappings are omitted only when they map the
    // character to itself. These two tests used to compare each field with
    // itself, so "slc" and "suc" could never appear in abbreviated output.
    if (full || !simpleLowercase.equals(s)) result.append(" slc='").append(Utility.quoteXML(simpleLowercase)).append('\'');
    if (full || !simpleUppercase.equals(s)) result.append(" suc='").append(Utility.quoteXML(simpleUppercase)).append('\'');
    if (full || !simpleTitlecase.equals(simpleUppercase)) result.append(" stc='").append(Utility.quoteXML(simpleTitlecase)).append('\'');
    if (full || !simpleCaseFolding.equals(simpleLowercase)) result.append(" sfc='").append(Utility.quoteXML(simpleCaseFolding)).append('\'');
    if (full || !specialCasing.equals("")) result.append(" fsc='").append(Utility.quoteXML(specialCasing)).append('\'');
    result.append("/>");
    return result.toString();
}
/**
 * Serializes this record in the binary layout that {@link #readBytes} reads:
 * the code point, the twelve optional strings, the numeric value, the binary
 * properties and finally the eleven byte-sized properties, in that order.
 *
 * <p>The record is compacted for the duration of the write so that defaulted
 * mappings are stored as a single zero byte. It is fleshed out again
 * afterwards, even if writing fails, so that the mapping fields are non-null
 * again for {@code toString()} and {@code equals()}.
 *
 * @param os the stream to write to
 * @throws IOException if the underlying stream fails
 */
public void writeBytes(DataOutputStream os) throws IOException {
    compact();
    try {
        os.writeInt(codePoint);
        writeString(os, name);
        writeString(os, decompositionMapping);
        writeString(os, simpleUppercase);
        writeString(os, simpleLowercase);
        writeString(os, simpleTitlecase);
        writeString(os, simpleCaseFolding);
        writeString(os, fullUppercase);
        writeString(os, fullLowercase);
        writeString(os, fullTitlecase);
        writeString(os, fullCaseFolding);
        writeString(os, specialCasing);
        writeString(os, bidiMirror);
        os.writeFloat(numericValue);
        os.writeInt(binaryProperties);
        os.writeByte(generalCategory);
        os.writeByte(combiningClass);
        os.writeByte(bidiClass);
        os.writeByte(decompositionType);
        os.writeByte(numericType);
        os.writeByte(eastAsianWidth);
        os.writeByte(lineBreak);
        os.writeByte(joiningType);
        os.writeByte(joiningGroup);
        os.writeByte(script);
        os.writeByte(age);
    } finally {
        // Undo compact(): restore the defaulted mapping fields.
        fleshOut();
    }
}
/**
 * Fills this record from the binary layout produced by writeBytes().
 * The reads must stay in exactly the order in which writeBytes() writes:
 * code point, twelve optional strings, numeric value, binary properties,
 * then eleven single-byte properties. Strings stored as null are replaced
 * with their defaults by the final fleshOut() call.
 *
 * @param is the stream to read from
 * @throws IOException if the underlying stream fails or ends early
 */
public void readBytes(DataInputStream is) throws IOException {
codePoint = is.readInt();
name = readString(is);
decompositionMapping = readString(is);
simpleUppercase = readString(is);
simpleLowercase = readString(is);
simpleTitlecase = readString(is);
simpleCaseFolding = readString(is);
fullUppercase = readString(is);
fullLowercase = readString(is);
fullTitlecase = readString(is);
fullCaseFolding = readString(is);
specialCasing = readString(is);
bidiMirror = readString(is);
numericValue = is.readFloat();
binaryProperties = is.readInt();
generalCategory = is.readByte();
combiningClass = is.readByte();
bidiClass = is.readByte();
decompositionType = is.readByte();
numericType = is.readByte();
eastAsianWidth = is.readByte();
lineBreak = is.readByte();
joiningType = is.readByte();
joiningGroup = is.readByte();
script = is.readByte();
age = is.readByte();
// Restore the defaults that compact() stripped before the record was written.
fleshOut();
// HACK
// Disabled: forced the CaseFoldTurkishI bit to be set only on 'i' and 'I'.
/*
int bp = binaryProperties;
bp &= ~(1 << CaseFoldTurkishI); // clear bit
if (codePoint == 'i' || codePoint == 'I') {
bp |= (1 << CaseFoldTurkishI);
}
if (bp != binaryProperties) {
if (!HACK) {
System.out.println("\tHACK Resetting CaseFoldTurkishI on U+" + Utility.hex(codePoint) + " " + name + " and others...");
HACK = true;
}
binaryProperties = bp;
}
*/
// Disabled: cleared the Math_Property bit on characters whose category is Sm.
/*
if (generalCategory == Sm) {
if ((binaryProperties & Math_PropertyMask) != 0) {
if (!HACK) {
System.out.println("Stripping " + Utility.hex(codePoint) + " " + name + " and others...");
HACK = true;
}
binaryProperties &= ~Math_PropertyMask;
}
}
*/
}
// One-shot latch for the diagnostic messages in the commented-out blocks of
// readBytes(): those blocks print once and then set this flag to true.
// No live code in this part of the file reads or writes it.
static boolean HACK = false;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,115 @@
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
//import java.text.*;
import com.ibm.text.utility.*;
/**
 * Placeholder for a generator of JavaScript normalization tables.
 * The class currently has no members: its only method, writeJavascriptInfo(),
 * is entirely commented out pending the TODO below. When enabled, that method
 * writes "Normalization_data.js" with NFKD (KD), NFD (D), canonical class (CC)
 * and composition (C) tables, skipping Hangul syllables U+AC00..U+D7A3.
 */
public class WriteJavaScriptInfo {
/* TODO: fix enumeration of compositions
NOTE(review): the statement "normKD.getD getComposition()" near the end of this
disabled method is not valid Java; presumably it is the enumeration this TODO
refers to. Also note that the loops stop at c < 0xFFFF, so U+FFFF is never visited.
static public void writeJavascriptInfo() throws IOException {
System.err.println("Writing Javascript data");
UCD ucd = UCD.make();
Normalizer normKD = new Normalizer(Normalizer.NFKD);
Normalizer normD = new Normalizer(Normalizer.NFD);
PrintWriter log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
int count = 0;
int datasize = 0;
int max = 0;
int over7 = 0;
log.println("var KD = new Object(); // NFKD compatibility decomposition mappings");
log.println("// NOTE: Hangul is done in code!");
CompactShortArray csa = new CompactShortArray((short)0);
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (normKD.hasDecomposition(c)) {
++count;
String decomp = normKD.normalize(c);
datasize += decomp.length();
if (max < decomp.length()) max = decomp.length();
if (decomp.length() > 7) ++over7;
csa.setElementAt(c, (short)count);
log.println("\t KD[0x" + Utility.hex(c) + "]='\\u" + Utility.hex(decomp,"\\u") + "';");
}
}
csa.compact();
log.println("// " + count + " NFKD mappings total");
log.println("// " + datasize + " total characters of results");
log.println("// " + max + " string length, maximum");
log.println("// " + over7 + " result strings with length > 7");
log.println("// " + csa.storage() + " trie length (doesn't count string size)");
log.println();
count = 0;
datasize = 0;
max = 0;
log.println("var D = new Object(); // NFD canonical decomposition mappings");
log.println("// NOTE: Hangul is done in code!");
csa = new CompactShortArray((short)0);
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (normD.hasDecomposition(c)) {
++count;
String decomp = normD.normalize(c);
datasize += decomp.length();
if (max < decomp.length()) max = decomp.length();
csa.setElementAt(c, (short)count);
log.println("\t D[0x" + Utility.hex(c) + "]='\\u" + Utility.hex(decomp,"\\u") + "';");
}
}
csa.compact();
log.println("// " + count + " NFD mappings total");
log.println("// " + datasize + " total characters of results");
log.println("// " + max + " string length, maximum");
log.println("// " + csa.storage() + " trie length (doesn't count string size)");
log.println();
count = 0;
datasize = 0;
log.println("var CC = new Object(); // canonical class mappings");
CompactByteArray cba = new CompactByteArray();
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
int canClass = normKD.getCanonicalClass(c);
if (canClass != 0) {
++count;
log.println("\t CC[0x" + Utility.hex(c) + "]=" + canClass + ";");
}
}
cba.compact();
log.println("// " + count + " canonical class mappings total");
log.println("// " + cba.storage() + " trie length");
log.println();
count = 0;
datasize = 0;
log.println("var C = new Object(); // composition mappings");
log.println("// NOTE: Hangul is done in code!");
IntHashtable.IntEnumeration enum = normKD.getD getComposition();
while (enum.hasNext()) {
int key = enum.next();
char val = (char) enum.value();
if (0xAC00 <= val && val <= 0xD7A3) continue;
++count;
log.println("\tC[0x" + Utility.hex(key) + "]=0x" + Utility.hex(val) + ";");
}
log.println("// " + count + " composition mappings total");
log.println();
log.close();
System.err.println("Done writing Javascript data");
}
*/
}

View file

@ -0,0 +1,38 @@
package com.ibm.text.utility;
import java.text.*;
import java.io.*;
/**
 * Unchecked exception whose message is built from a {@link MessageFormat}
 * pattern plus arguments, optionally followed by a description and stack
 * trace of another exception that caused it.
 */
public class ChainException extends RuntimeException {
    /** Private copy of the pattern arguments; null when none were supplied. */
    Object[] keyData;
    /** MessageFormat pattern for the main part of the message; may be null. */
    String messageFormat;
    /** Exception that triggered this one; null when there is none. */
    Exception chain;

    /**
     * Creates an exception without a chained cause.
     * @param messageFormat MessageFormat pattern of the message
     * @param objects arguments for the pattern; copied, may be null
     */
    public ChainException (String messageFormat, Object[] objects) {
        // Delegate so that both constructors treat a null argument array alike
        // (this constructor used to throw NullPointerException for it).
        this(messageFormat, objects, null);
    }

    /**
     * Creates an exception that wraps another one.
     * @param messageFormat MessageFormat pattern of the message
     * @param objects arguments for the pattern; copied, may be null
     * @param chainedException the exception being wrapped; may be null
     */
    public ChainException (String messageFormat, Object[] objects, Exception chainedException) {
        this.messageFormat = messageFormat;
        keyData = objects == null ? null : (Object[]) objects.clone();
        chain = chainedException;
    }

    /**
     * Builds the message on demand.
     * @return the formatted pattern (the raw pattern when no arguments were
     * given, the empty string when there is no pattern), followed, if an
     * exception is chained, by "; " plus its class name, message and stack trace
     */
    public String getMessage() {
        String main = "";
        if (messageFormat != null) {
            // Without arguments there is nothing to substitute; keep the text
            // instead of silently dropping it.
            main = keyData == null
                ? messageFormat
                : MessageFormat.format(messageFormat, keyData);
        }
        if (chain == null) {
            return main;
        }
        StringWriter w = new StringWriter();
        PrintWriter p = new PrintWriter(w);
        chain.printStackTrace(p);
        p.close(); // flushes everything into w before it is read
        return main + "; " + chain.getClass().getName()
            + ", " + chain.getMessage()
            + ", " + w.getBuffer();
    }
}

View file

@ -0,0 +1,305 @@
package com.ibm.text.utility;
/*
* %W% %E%
*
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
*
* Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
*
* The original version of this source code and documentation is copyrighted
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
* materials are provided under terms of a License Agreement between Taligent
* and Sun. This technology is protected by multiple US and International
* patents. This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
*/
import java.io.*;
/**
 * Provides a compact way to store information that is indexed by Unicode
 * values (16-bit <code>char</code>s), such as character properties, types,
 * keyboard values, etc. Only for internal use for now. Made public for
 * discussion purposes.
 * <p>
 * The 65536 possible indices are split into blocks of BLOCKCOUNT entries.
 * <code>indices</code> holds, per block, the offset of that block inside
 * <code>values</code>; the offset is stored in a <code>short</code> but is
 * always read as an unsigned 16-bit number. After {@link #compact()} identical
 * blocks share the same storage.
 *
 * @see CompactIntArray
 * @see CompactShortArray
 * @version %I% %G%
 * @author Helena Shih
 */
public final class CompactByteArray implements Serializable {
    /** Number of distinct <code>char</code> values. */
    public static final int UNICODECOUNT = 65536;

    /** Creates an uncompacted array in which every element is 0. */
    public CompactByteArray()
    {
        this((byte)0);
    }

    /**
     * Creates an uncompacted array.
     * @param defaultValue initial value of every element
     */
    public CompactByteArray(byte defaultValue)
    {
        int i;
        values = new byte[UNICODECOUNT];
        indices = new short[INDEXCOUNT];
        for (i = 0; i < UNICODECOUNT; ++i) {
            values[i] = defaultValue;
        }
        for (i = 0; i < INDEXCOUNT; ++i) {
            indices[i] = (short)(i << BLOCKSHIFT);
        }
        isCompact = false;
    }

    /**
     * Wraps already-compacted tables. The arrays are used directly, not copied.
     * @param indexArray block offsets into <code>newValues</code>, one per
     * block, each interpreted as an unsigned 16-bit value
     * @param newValues the value table
     * @exception IllegalArgumentException if <code>indexArray</code> has the
     * wrong length or holds an offset that is too large
     */
    public CompactByteArray(short indexArray[],
                            byte newValues[]) throws IllegalArgumentException
    {
        if (indexArray.length != INDEXCOUNT) {
            throw new IllegalArgumentException();
        }
        for (int i = 0; i < INDEXCOUNT; ++i) {
            // Offsets of 32768 and above are negative as a short; they are
            // perfectly valid (compact() produces them), so compare unsigned.
            int index = indexArray[i] & 0xFFFF;
            if (index >= newValues.length + BLOCKCOUNT) {
                throw new IllegalArgumentException();
            }
        }
        indices = indexArray;
        values = newValues;
        isCompact = true;
    }

    /**
     * Writes Java source for a <code>GeneralCategory</code> class that embeds
     * the current index and value tables, then closes <code>output</code>.
     * @param output destination; closed before this method returns
     */
    public void writeArrays(PrintWriter output)
    {
        int i;
        output.println("package com.ibm.text.unicode;");
        output.println("import com.ibm.text.collections.*;");
        output.println("public final class GeneralCategory {");
        output.println(" public static byte getCategory (char ch) {");
        output.println(" return compactArray.elementAt(ch);");
        output.println(" }");
        output.println(" static CompactByteArray compactArray;");
        output.println(" static void init () {");
        output.println(" short[] index = {");
        for (i = 0; i < indices.length; i++) {
            if (i % 8 == 0) output.println();
            output.print("(short)" + (indices[i] & 0xFFFF) + ", ");
        }
        output.println(" };");
        output.println(" byte[] data = {");
        for (i = 0; i < values.length; i++) {
            if (i % 8 == 0) output.println();
            output.print("(byte)" + (values[i] & 0xFF) + ", ");
        }
        output.println(" };");
        output.println(" compactArray = new CompactByteArray(index, data);");
        output.println(" }");
        output.println("}");
        output.close();
    }

    /**
     * Returns the value stored for a character.
     * @param index the character to look up
     * @return the value mapped to <code>index</code>
     */
    public byte elementAt(char index) // parameterized on byte
    {
        return (values[(indices[index >>> BLOCKSHIFT] & 0xFFFF) +
                       (index & BLOCKMASK)]);
    }

    /**
     * Sets the value for one character. Automatically expands the array if it
     * is compacted.
     * @param index the character whose value is set
     * @param value the new value
     */
    public void setElementAt(char index, byte value)
    {
        if (isCompact) {
            expand();
        }
        values[(int)index] = value;
    }

    /**
     * Sets the value for a range of characters. Automatically expands the
     * array if it is compacted.
     * @param start first character of the range
     * @param end last character of the range (inclusive)
     * @param value the new value
     */
    public void setElementAt(char start, char end, byte value)
    {
        if (isCompact) {
            expand();
        }
        for (int i = start; i <= end; ++i) {
            values[i] = value;
        }
    }

    /**
     * Compacts the array: every block whose contents equal those of a block
     * that has already been placed reuses that block's storage.
     * Does nothing if the array is already compact.
     */
    public void compact()
    {
        if (isCompact) {
            return;
        }
        // tempIndex[k] is the position in the uncompacted values array whose
        // content ends up at position k of the compacted array. It is sized
        // for the worst case in which no two blocks are alike.
        char[] tempIndex = new char[UNICODECOUNT];

        // The first block is always kept in place.
        int tempIndexCount = BLOCKCOUNT;
        for (int iIndex = 0; iIndex < BLOCKCOUNT; ++iIndex) {
            tempIndex[iIndex] = (char)iIndex;
        }
        indices[0] = (short)0;

        // For each successive block, find its position in the compacted array.
        // All counters are ints: the compacted size can reach 65536, which a
        // short counter cannot represent (it wraps to a negative array index).
        for (int iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) {
            int block = iBlock << BLOCKSHIFT;
            if (DEBUGSMALL) if (block > DEBUGSMALLLIMIT) break;
            int firstPosition = FindOverlappingPosition(block, tempIndex,
                                                        tempIndexCount);
            int newCount = firstPosition + BLOCKCOUNT;
            if (newCount > tempIndexCount) {
                // no equal block found: append this one
                for (int iIndex = tempIndexCount; iIndex < newCount; ++iIndex) {
                    tempIndex[iIndex] = (char)(iIndex - firstPosition + block);
                }
                tempIndexCount = newCount;
            }
            // stored as an unsigned 16-bit offset; see elementAt()
            indices[iBlock] = (short)firstPosition;
        }

        // Now allocate and copy the items into the compacted array.
        byte[] tempArray = new byte[tempIndexCount];
        for (int iIndex = 0; iIndex < tempIndexCount; ++iIndex) {
            tempArray[iIndex] = values[tempIndex[iIndex]];
        }
        values = tempArray;
        isCompact = true;
    }

    /**
     * Takes the array back to a 65536 element array with one storage slot
     * per character. Does nothing if the array is not compact.
     */
    public void expand()
    {
        int i;
        if (isCompact) {
            byte[] tempArray = new byte[UNICODECOUNT];
            for (i = 0; i < UNICODECOUNT; ++i) {
                tempArray[i] = elementAt((char)i);
            }
            for (i = 0; i < INDEXCOUNT; ++i) {
                indices[i] = (short)(i << BLOCKSHIFT);
            }
            values = tempArray;
            isCompact = false;
        }
    }

    /**
     * Debug only: prints index entries <code>start</code> (inclusive) to
     * <code>count</code> (exclusive) as unsigned offsets.
     */
    public void printIndex(short start, short count)
    {
        for (int i = start; i < count; ++i) {
            System.out.println(i + " -> : " + (indices[i] & 0xFFFF));
        }
        System.out.println();
    }

    /**
     * Debug only: prints <code>count</code> values beginning at
     * <code>start</code>, either directly from the value table or, when
     * <code>tempIndex</code> is not null, through that indirection table.
     */
    public void printPlainArray(int start, int count, char[] tempIndex)
    {
        int iIndex;
        if (tempIndex != null) {
            for (iIndex = start; iIndex < start + count; ++iIndex) {
                System.out.print(" " + (int)values[tempIndex[iIndex]]);
            }
        } else {
            for (iIndex = start; iIndex < start + count; ++iIndex) {
                System.out.print(" " + (int)values[iIndex]);
            }
        }
        System.out.println(" Range: start " + start + " , count " + count);
    }

    /**
     * Number of elements in the value table, truncated to a
     * <code>short</code>; an uncompacted table of 65536 entries is therefore
     * reported as 0.
     */
    public short capacity()
    {
        return (short)values.length;
    }

    /**
     * @return size estimate computed as one unit per value, two per index
     * entry, plus 12
     */
    public int storage()
    {
        return values.length * 1 + indices.length * 2 + 12;
    }

    private byte[] getArray()
    {
        return values;
    }

    /**
     * Searches the blocks placed so far for one equal to the block that
     * begins at <code>start</code> in the uncompacted value table.
     * @return the position of the first equal block in the compacted layout,
     * or <code>tempIndexCount</code> when there is none (the block must then
     * be appended)
     */
    private int
    FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount)
    {
        int i;
        int j;
        int currentCount;
        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            printPlainArray(start, BLOCKCOUNT, null);
            printPlainArray(0, tempIndexCount, tempIndex);
        }
        for (i = 0; i < tempIndexCount; i += BLOCKCOUNT) {
            currentCount = BLOCKCOUNT;
            if (i + BLOCKCOUNT > tempIndexCount) {
                currentCount = tempIndexCount - i;
            }
            for (j = 0; j < currentCount; ++j) {
                if (values[start + j] != values[tempIndex[i + j]]) break;
            }
            if (j == currentCount) break;
        }
        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            for (j = 1; j < i; ++j) {
                System.out.print(" ");
            }
            printPlainArray(start, BLOCKCOUNT, null);
            System.out.println(" Found At: " + i);
        }
        return i;
    }

    private static final int DEBUGSHOWOVERLAPLIMIT = 100;
    private static final boolean DEBUGTRACE = false;
    private static final boolean DEBUGSMALL = false;
    private static final boolean DEBUGOVERLAP = false;
    private static final int DEBUGSMALLLIMIT = 30000;
    private static final int BLOCKSHIFT = 6;
    private static final int BLOCKCOUNT = (1 << BLOCKSHIFT);
    private static final int INDEXSHIFT = (16 - BLOCKSHIFT);
    private static final int INDEXCOUNT = (1 << INDEXSHIFT);
    private static final int BLOCKMASK = BLOCKCOUNT - 1;
    private byte[] values;   // char -> byte
    private short indices[]; // block number -> unsigned offset into values
    private boolean isCompact;
}

View file

@ -0,0 +1,367 @@
package com.ibm.text.utility;
/*
* %W% %E%
*
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
*
* Portions copyright (c) 1996 Sun Microsystems, Inc. All Rights Reserved.
*
* The original version of this source code and documentation is copyrighted
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
* materials are provided under terms of a License Agreement between Taligent
* and Sun. This technology is protected by multiple US and International
* patents. This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
*/
import java.io.*;
import java.lang.*;
/**
 * class CompactATypeArray : use only on primitive data types
 * Provides a compact way to store information that is indexed by Unicode
 * values, such as character properties, types, keyboard values, etc. This
 * is very useful when you have a block of Unicode data that contains
 * significant values while the rest of the Unicode data is unused in the
 * application or when you have a lot of redundancy, such as where all 21,000
 * Han ideographs have the same value. However, lookup is much faster than a
 * hash table.
 * A compact array of any primitive data type serves two purposes:
 * <UL type = round>
 * <LI>Fast access of the indexed values.
 * <LI>Smaller memory footprint.
 * </UL>
 * A compact array is composed of an index array and a value array. The index
 * array contains the indices of Unicode characters to the value array. Each
 * index entry is stored in a <code>short</code> but is always read as an
 * unsigned 16-bit offset.
 * @see CompactByteArray
 * @see CompactIntArray
 * @see CompactCharArray
 * @see CompactStringArray
 * @version %I% %G%
 * @author Helena Shih
 */
public final class CompactShortArray implements Serializable {
    /**
     * The total number of Unicode characters.
     */
    public static final int UNICODECOUNT = 65536;

    /**
     * Default constructor for CompactShortArray, the default value of the
     * compact array is 0.
     */
    public CompactShortArray()
    {
        this((short)0);
    }

    /**
     * Constructor for CompactShortArray.
     * @param defaultValue the default value of the compact array.
     */
    public CompactShortArray(short defaultValue)
    {
        int i;
        values = new short[UNICODECOUNT];
        indices = new short[INDEXCOUNT];
        for (i = 0; i < UNICODECOUNT; ++i) {
            values[i] = defaultValue;
        }
        for (i = 0; i < INDEXCOUNT; ++i) {
            indices[i] = (short)(i << BLOCKSHIFT);
        }
        isCompact = false;
    }

    /**
     * Constructor for CompactShortArray that wraps already-compacted tables.
     * The arrays are used directly, not copied.
     * @param indexArray the indices of the compact array, each interpreted
     * as an unsigned 16-bit offset into <code>newValues</code>.
     * @param newValues the values of the compact array.
     * @exception IllegalArgumentException If the index is out of range.
     */
    public CompactShortArray(short indexArray[],
                             short newValues[]) throws IllegalArgumentException
    {
        if (indexArray.length != INDEXCOUNT) {
            throw new IllegalArgumentException("Index out of bounds.");
        }
        for (int i = 0; i < INDEXCOUNT; ++i) {
            // Offsets of 32768 and above are negative as a short; they are
            // valid (compact() produces them), so compare unsigned.
            int index = indexArray[i] & 0xFFFF;
            if (index >= newValues.length + BLOCKCOUNT) {
                throw new IllegalArgumentException("Index out of bounds.");
            }
        }
        indices = indexArray;
        values = newValues;
        // The tables are in compacted form. Without this flag setElementAt()
        // would skip expand() and write into newValues by raw character code.
        isCompact = true;
    }

    /**
     * Get the mapped value of a Unicode character.
     * @param index the character to get the mapped value with
     * @return the mapped value of the given character
     */
    public short elementAt(char index) // parameterized on short
    {
        return (values[(indices[index >> BLOCKSHIFT] & 0xFFFF)
                       + (index & BLOCKMASK)]);
    }

    /**
     * Set a new value for a Unicode character.
     * Set automatically expands the array if it is compacted.
     * @param index the character to set the mapped value with
     * @param value the new mapped value
     */
    public void setElementAt(char index, short value)
    {
        if (isCompact) {
            expand();
        }
        values[(int)index] = value;
    }

    /**
     * Set new values for a range of Unicode character.
     * Set automatically expands the array if it is compacted.
     * @param start the starting offset of the range
     * @param end the ending offset of the range (inclusive)
     * @param value the new mapped value
     */
    public void setElementAt(char start, char end, short value)
    {
        if (isCompact) {
            expand();
        }
        for (int i = start; i <= end; ++i) {
            values[i] = value;
        }
    }

    /**
     * Compact the array: every block whose contents equal those of a block
     * that has already been placed reuses that block's storage.
     * Does nothing if the array is already compact.
     */
    public void compact()
    {
        if (isCompact) {
            return;
        }
        // tempIndex[k] is the position in the uncompacted values array whose
        // content ends up at position k of the compacted array. It is sized
        // for the worst case in which no two blocks are alike.
        char[] tempIndex = new char[UNICODECOUNT];

        // The first block is always kept in place.
        int tempIndexCount = BLOCKCOUNT;
        for (int iIndex = 0; iIndex < BLOCKCOUNT; ++iIndex) {
            tempIndex[iIndex] = (char)iIndex;
        }
        indices[0] = (short)0;

        // For each successive block, find its position in the compacted array.
        // All counters are ints: the compacted size can reach 65536, which a
        // short counter cannot represent (it wraps to a negative array index).
        for (int iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) {
            int block = iBlock << BLOCKSHIFT;
            if (DEBUGSMALL) if (block > DEBUGSMALLLIMIT) break;
            int firstPosition = FindOverlappingPosition(block, tempIndex,
                                                        tempIndexCount);
            int newCount = firstPosition + BLOCKCOUNT;
            if (newCount > tempIndexCount) {
                // no equal block found: append this one
                for (int iIndex = tempIndexCount; iIndex < newCount; ++iIndex) {
                    tempIndex[iIndex] = (char)(iIndex - firstPosition + block);
                }
                tempIndexCount = newCount;
            }
            // stored as an unsigned 16-bit offset; see elementAt()
            indices[iBlock] = (short)firstPosition;
        }

        // Now allocate and copy the items into the compacted array.
        short[] tempArray = new short[tempIndexCount];
        for (int iIndex = 0; iIndex < tempIndexCount; ++iIndex) {
            tempArray[iIndex] = values[tempIndex[iIndex]];
        }
        values = tempArray;
        isCompact = true;
    }

    /** For internal use only. Do not modify the result, the behavior of
     * modified results are undefined.
     * @return the internal index array (not a copy)
     */
    public short[] getIndexArray()
    {
        return indices;
    }

    /** For internal use only. Do not modify the result, the behavior of
     * modified results are undefined.
     * @return the internal value array (not a copy)
     */
    public short[] getStringArray()
    {
        return values;
    }

    // --------------------------------------------------------------
    // package private
    // --------------------------------------------------------------

    /**
     * Prints the index table (as unsigned offsets) and the value table to
     * System.out as two brace-delimited lists of "(short)n" entries.
     */
    void writeArrays()
    {
        int i;
        int cnt = ((values.length > 0) ? values.length :
                   (values.length + UNICODECOUNT));
        System.out.println("{");
        for (i = 0; i < INDEXCOUNT-1; i++) {
            System.out.print("(short)" + (getIndexArrayValue(i) & 0xFFFF) + ", ");
            if (i != 0 && i % 10 == 0) {
                System.out.println();
            }
        }
        System.out.println("(short)" + (getIndexArrayValue(INDEXCOUNT-1) & 0xFFFF) + " }");
        System.out.println("{");
        for (i = 0; i < cnt-1; i++) {
            System.out.print("(short)" + (int)getArrayValue(i) + ", ");
            if (i != 0 && i % 10 == 0) {
                System.out.println();
            }
        }
        System.out.println("(short)" + (int)getArrayValue(cnt-1) + " }");
    }

    /**
     * Debug only: prints index entries <code>start</code> (inclusive) to
     * <code>count</code> (exclusive) as unsigned offsets.
     */
    void printIndex(short start, short count)
    {
        for (int i = start; i < count; ++i) {
            System.out.println(i + " -> : " + (indices[i] & 0xFFFF));
        }
        System.out.println();
    }

    /**
     * Debug only: prints <code>count</code> values beginning at
     * <code>start</code>, either directly from the value table or, when
     * <code>tempIndex</code> is not null, through that indirection table.
     */
    void printPlainArray(int start, int count, char[] tempIndex)
    {
        int iIndex;
        if (tempIndex != null) {
            for (iIndex = start; iIndex < start + count; ++iIndex) {
                System.out.print(" " + (int)getArrayValue(tempIndex[iIndex]));
            }
        } else {
            for (iIndex = start; iIndex < start + count; ++iIndex) {
                System.out.print(" " + (int)getArrayValue(iIndex));
            }
        }
        System.out.println(" Range: start " + start + " , count " + count);
    }

    // --------------------------------------------------------------
    // private
    // --------------------------------------------------------------

    /**
     * Expanding takes the array back to a 65536 element array.
     */
    private void expand()
    {
        int i;
        if (isCompact) {
            short[] tempArray = new short[UNICODECOUNT];
            for (i = 0; i < UNICODECOUNT; ++i) {
                tempArray[i] = elementAt((char)i);
            }
            for (i = 0; i < INDEXCOUNT; ++i) {
                indices[i] = (short)(i << BLOCKSHIFT);
            }
            values = tempArray;
            isCompact = false;
        }
    }

    // # of elements in the indexed array, truncated to a short
    private short capacity()
    {
        return (short)values.length;
    }

    /**
     * @return size estimate computed as two units per value, two per index
     * entry, plus 12
     */
    public int storage()
    {
        return values.length * 2 + indices.length * 2 + 12;
    }

    private short getArrayValue(int n)
    {
        return values[n];
    }

    private short getIndexArrayValue(int n)
    {
        return indices[n];
    }

    /**
     * Searches the blocks placed so far for one equal to the block that
     * begins at <code>start</code> in the uncompacted value table.
     * @return the position of the first equal block in the compacted layout,
     * or <code>tempIndexCount</code> when there is none (the block must then
     * be appended)
     */
    private int
    FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount)
    {
        int i;
        int j;
        int currentCount;
        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            printPlainArray(start, BLOCKCOUNT, null);
            printPlainArray(0, tempIndexCount, tempIndex);
        }
        for (i = 0; i < tempIndexCount; i += BLOCKCOUNT) {
            currentCount = BLOCKCOUNT;
            if (i + BLOCKCOUNT > tempIndexCount) {
                currentCount = tempIndexCount - i;
            }
            for (j = 0; j < currentCount; ++j) {
                if (values[start + j] != values[tempIndex[i + j]]) break;
            }
            if (j == currentCount) break;
        }
        if (DEBUGOVERLAP && start < DEBUGSHOWOVERLAPLIMIT) {
            for (j = 1; j < i; ++j) {
                System.out.print(" ");
            }
            printPlainArray(start, BLOCKCOUNT, null);
            System.out.println(" Found At: " + i);
        }
        return i;
    }

    private static final int DEBUGSHOWOVERLAPLIMIT = 100;
    private static final boolean DEBUGTRACE = false;
    private static final boolean DEBUGSMALL = false;
    private static final boolean DEBUGOVERLAP = false;
    private static final int DEBUGSMALLLIMIT = 30000;
    private static final int BLOCKSHIFT = 7;
    private static final int BLOCKCOUNT = (1 << BLOCKSHIFT);
    private static final int INDEXSHIFT = (16 - BLOCKSHIFT);
    private static final int INDEXCOUNT = (1 << INDEXSHIFT);
    private static final int BLOCKMASK = BLOCKCOUNT - 1;
    private short values[];  // char -> short
    private short indices[]; // block number -> unsigned offset into values
    private boolean isCompact;
}

View file

@ -0,0 +1,65 @@
package com.ibm.text.utility;
import java.io.IOException;
//import com.ibm.text.unicode.UInfo;
import java.util.*;
import java.io.*;
import java.text.*;
/**
 * Accumulates a tally per string key and offers views of the collected keys.
 * Note that each call to add() increases the key's tally by the length of the
 * key, not by one.
 */
public final class Counter {
    /** Key (String) to its tally (RWInteger). */
    Map map = new HashMap();

    /**
     * Mutable int holder that sorts by descending value. Every instance gets
     * a distinct serial number so that two holders with the same value never
     * compare as equal; among equal values the holder created later sorts first.
     */
    static public final class RWInteger implements Comparable {
        static int uniqueCount;
        public int value;
        private int forceUnique = uniqueCount++;

        public int compareTo(Object other) {
            RWInteger that = (RWInteger) other;
            if (value != that.value) {
                return value > that.value ? -1 : 1;
            }
            if (forceUnique != that.forceUnique) {
                return forceUnique > that.forceUnique ? -1 : 1;
            }
            return 0;
        }

        public String toString() {
            return Integer.toString(value);
        }
    }

    /**
     * Adds the length of <code>obj</code> to the tally kept for it, creating
     * the tally on first use.
     */
    public void add(String obj) {
        Object existing = map.get(obj);
        RWInteger tally;
        if (existing != null) {
            tally = (RWInteger) existing;
        } else {
            tally = new RWInteger();
            map.put(obj, tally);
        }
        tally.value += obj.length();
    }

    /**
     * @return a new sorted map from tally (RWInteger) to key, ordered by
     * RWInteger's comparison, i.e. largest tally first
     */
    public Map getSortedByCount() {
        Map byCount = new TreeMap();
        for (Iterator entries = map.entrySet().iterator(); entries.hasNext();) {
            Map.Entry entry = (Map.Entry) entries.next();
            byCount.put(entry.getValue(), entry.getKey());
        }
        return byCount;
    }

    /**
     * @return a new map in which every collected key is mapped to itself
     */
    public Map getKeyToKey() {
        Map identity = new HashMap();
        for (Iterator keys = map.keySet().iterator(); keys.hasNext();) {
            Object key = keys.next();
            identity.put(key, key);
        }
        return identity;
    }
}

View file

@ -0,0 +1,164 @@
package com.ibm.text.utility;
/** Basic Diff program. Compares two sequences of objects fed into it, and
 * lets you know where they are different. For a usage example, see DifferTest
 * <p>
 * Items are fed in with add/addA/addB and buffered. After checkMatch(), the
 * buffered A items split into a common prefix of maxSame items, followed by
 * getACount() differing items; likewise for B. The offsets taken by
 * getA/getALine (and getB/getBLine) are relative to the first differing item.
 * The next addA/addB call discards everything that has been reported.
 * @author Mark Davis
 * @version 1.0
 */
final public class Differ {
    public static final String copyright =
        "Copyright (C) 2000, International Business Machines Corporation and others. All Rights Reserved.";

    /**
     * @param stackSize The size of the largest difference you expect.
     * @param matchCount The number of items that have to be the same to count as a match
     */
    public Differ(int stackSize, int matchCount) {
        this.STACKSIZE = stackSize;
        this.EQUALSIZE = matchCount;
        a = new Object[stackSize+matchCount];
        b = new Object[stackSize+matchCount];
    }

    /** Adds one item to each of the two sequences. */
    public void add (Object aStr, Object bStr) {
        addA(aStr);
        addB(bStr);
    }

    /** Adds the next item of sequence A, first discarding already reported items. */
    public void addA (Object aStr) {
        flush();
        a[aCount++] = aStr;
    }

    /** Adds the next item of sequence B, first discarding already reported items. */
    public void addB (Object bStr) {
        flush();
        b[bCount++] = bStr;
    }

    /**
     * @param offset position relative to the first differing A item
     * @return the 1-based line number in sequence A of that position
     */
    public int getALine(int offset) {
        return aLine + maxSame + offset;
    }

    /**
     * @param offset position relative to the first differing A item
     * @return the differing item at that position; the last item before the
     * difference for a negative offset; the item following the difference
     * (or "" if that is not known) for offsets at or past getACount()
     */
    public Object getA(int offset) {
        if (offset < 0) return last;
        if (offset >= aTop-maxSame) return next;
        // Skip the common prefix so that the item agrees with getALine(offset).
        return a[maxSame + offset];
    }

    /** @return the number of differing A items found by the last checkMatch() */
    public int getACount() {
        return aTop-maxSame;
    }

    /** @return the number of differing B items found by the last checkMatch() */
    public int getBCount() {
        return bTop-maxSame;
    }

    /**
     * @param offset position relative to the first differing B item
     * @return the 1-based line number in sequence B of that position
     */
    public int getBLine(int offset) {
        return bLine + maxSame + offset;
    }

    /**
     * @param offset position relative to the first differing B item
     * @return the differing item at that position; the last item before the
     * difference for a negative offset; the item following the difference
     * (or "" if that is not known) for offsets at or past getBCount()
     */
    public Object getB(int offset) {
        if (offset < 0) return last;
        if (offset >= bTop-maxSame) return next;
        // Skip the common prefix so that the item agrees with getBLine(offset).
        return b[maxSame + offset];
    }

    /**
     * Examines the buffered items and determines the current difference, if any.
     * @param finalPass true when no more items will be added; everything
     * after the common prefix is then reported as different
     */
    public void checkMatch(boolean finalPass) {
        // find the initial strings that are the same
        int max = aCount;
        if (max > bCount) max = bCount;
        int i;
        for (i = 0; i < max; ++i) {
            if (!a[i].equals(b[i])) break;
        }
        // at this point, all items up to i are equal
        maxSame = i;
        aTop = bTop = maxSame;
        if (maxSame > 0) last = a[maxSame-1];
        next = "";
        if (finalPass) {
            aTop = aCount;
            bTop = bCount;
            next = "";
            return;
        }
        // not enough items after the common prefix to look for a resync point
        if (aCount - maxSame < EQUALSIZE || bCount - maxSame < EQUALSIZE) return;
        // now see if the last few a's occur anywhere in the b's, or vice versa
        int match = find (a, aCount-EQUALSIZE, aCount, b, maxSame, bCount);
        if (match != -1) {
            aTop = aCount-EQUALSIZE;
            bTop = match;
            next = a[aTop];
            return;
        }
        match = find (b, bCount-EQUALSIZE, bCount, a, maxSame, aCount);
        if (match != -1) {
            bTop = bCount-EQUALSIZE;
            aTop = match;
            next = b[bTop];
            return;
        }
        if (aCount >= STACKSIZE || bCount >= STACKSIZE) {
            // flush some of them
            aCount = (aCount + maxSame) / 2;
            bCount = (bCount + maxSame) / 2;
            next = "";
        }
    }

    /** Convenient utility
     * finds a segment of the first array in the second array.
     * @return -1 if not found, otherwise start position in b
     */
    public int find (Object[] a, int aStart, int aEnd, Object[] b, int bStart, int bEnd) {
        int len = aEnd - aStart;
        int bEndMinus = bEnd - len;
        tryA:
        for (int i = bStart; i <= bEndMinus; ++i) {
            for (int j = 0; j < len; ++j) {
                if (!b[i + j].equals(a[aStart + j])) continue tryA;
            }
            return i; // we have a match!
        }
        return -1;
    }

    // ====================== PRIVATES ======================

    /**
     * Drops the items below aTop/bTop (common prefix plus reported difference)
     * from the buffers and advances the line numbers accordingly.
     */
    private void flush() {
        if (aTop != 0) {
            int newCount = aCount-aTop;
            System.arraycopy(a, aTop, a, 0, newCount);
            aCount = newCount;
            aLine += aTop;
            aTop = 0;
        }
        if (bTop != 0) {
            int newCount = bCount-bTop;
            System.arraycopy(b, bTop, b, 0, newCount);
            bCount = newCount;
            bLine += bTop;
            bTop = 0;
        }
    }

    private int STACKSIZE;
    private int EQUALSIZE;
    private Object [] a;
    private Object [] b;
    private Object last = "";
    private Object next = "";
    private int aCount = 0;
    private int bCount = 0;
    private int aLine = 1;
    private int bLine = 1;
    private int maxSame = 0, aTop = 0, bTop = 0;
}

View file

@ -0,0 +1,37 @@
package com.ibm.text.utility;
/**
 * Small command-line demonstration of Differ: feeds two short sequences that
 * differ by one transposition and prints each reported difference with one
 * item of context before and after it.
 */
public class DifferTest {
    public static final String copyright =
        "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";

    /**
     * Runs the demonstration. Declared public so that the standard
     * launcher accepts it as an entry point.
     * @param args ignored
     */
    public static final void main(String[] args) { // for testing
        String[] as = {"a", "b", "20D4", "0344", "20D5", "20D6", "20D7", "20D8", "20D9"};
        String[] bs = {"a", "b", "20D4", "20D5", "0344", "20D6", "20D7", "20D8", "20D9"};
        Differ differ = new Differ(50,2);
        int max = as.length;
        if (max < bs.length) max = bs.length;
        // one extra iteration (j == max) adds nothing and runs the final pass
        for (int j = 0; j <= max; ++j) {
            if (j < as.length) differ.addA(as[j]);
            if (j < bs.length) differ.addB(bs[j]);
            differ.checkMatch(j == max);
            if (differ.getACount() != 0 || differ.getBCount() != 0) {
                if (differ.getACount() != 0) {
                    // offsets -1 and getACount() are the context items
                    for (int i = -1; i < differ.getACount()+1; ++i) {
                        System.out.println("a: " + differ.getALine(i) + " " + differ.getA(i));
                    }
                }
                if (differ.getBCount() != 0) {
                    if (differ.getACount() != 0) System.out.println();
                    for (int i = -1; i < differ.getBCount()+1; ++i) {
                        System.out.println("b: " + differ.getBLine(i) + " " + differ.getB(i));
                    }
                }
            }
            System.out.println("----");
            //differ.flush();
        }
    }
}

View file

@ -0,0 +1,52 @@
package com.ibm.text.utility;
import java.awt.*;
import java.io.*;
/**
 * Writer that duplicates everything written to it onto two underlying
 * writers, optionally flushing both after every write.
 */
final public class DualWriter extends Writer {
    private static final String copyright = "(C) Copyright IBM Corp. 1998 - All Rights Reserved";
    // Abstract class for writing to character streams.
    // The only methods that a subclass must implement are
    // write(char[], int, int), flush(), and close().

    /** When true, every write is followed by a flush of both writers. */
    private boolean autoflush ;
    private Writer a;
    private Writer b;

    /**
     * Creates a writer without automatic flushing.
     * @param a first destination
     * @param b second destination
     */
    public DualWriter (Writer a, Writer b) {
        this.a = a;
        this.b = b;
    }

    /**
     * @param a first destination
     * @param b second destination
     * @param autoFlush whether to flush both destinations after every write
     */
    public DualWriter (Writer a, Writer b, boolean autoFlush) {
        this.a = a;
        this.b = b;
        autoflush = autoFlush;
    }

    public void setAutoFlush(boolean value) {
        autoflush = value;
    }

    public boolean getAutoFlush() {
        return autoflush;
    }

    /**
     * Writes the characters to the first and then to the second destination.
     * @throws IOException if either destination fails
     */
    public void write(char cbuf[],
                      int off,
                      int len) throws IOException {
        a.write(cbuf, off, len);
        b.write(cbuf, off, len);
        if (autoflush) flush();
    }

    /**
     * Closes both destinations. The second one is closed even when closing
     * the first one fails, so that it is not leaked.
     * @throws IOException if closing either destination fails
     */
    public void close() throws IOException {
        try {
            a.close();
        } finally {
            b.close();
        }
    }

    /**
     * Flushes the first and then the second destination.
     * @throws IOException if either destination fails
     */
    public void flush() throws IOException {
        a.flush();
        b.flush();
    }
}

View file

@ -0,0 +1,152 @@
package com.ibm.text.utility;
import java.io.*;
import java.util.*;
public class EquivalenceClass {
// When true, add() traces its inputs, merges and resulting sets to System.out.
static final boolean DEBUG = false;
/**
* Takes a many:many relation between source and value.
* Produces equivalence class.
* Two sources are in the same equivalence class any time they share the same value.
*/
// Associated with each value, we keep a representative source: the source
// that add() first recorded for that value.
// Whenever we add a <source, value> pair, we see if any sets collide; if the
// source's set and the set of the value's representative differ, they are merged.
// sourceToEquiv: source -> Set of all sources in its class (one Set instance
// shared by every member of the class).
// valueToRepresentativeSource: value -> representative source.
// forcedMerge: source -> Set of the values whose addition caused a merge
// involving that source's class.
Map sourceToEquiv = new HashMap();
Map valueToRepresentativeSource = new HashMap();
Map forcedMerge = new HashMap();
// NOTE(review): a Javadoc reading "@return true if made a difference" used to
// sit here; it appears to describe add(Object, Object), not the field below.
// Output options stored by the constructors; they are not used in the part of
// the class shown here.
String itemSeparator;
int places;
boolean hex;
/**
* Creates an instance with the defaults: "," as item separator, 4 places, hex on.
*/
public EquivalenceClass() {
this(",", 4, true);
}
/**
* Creates an instance with explicit output options. The values are only stored here.
* @param itemSeparator stored in the itemSeparator field
* @param places stored in the places field
* @param hex stored in the hex field
*/
public EquivalenceClass(String itemSeparator, int places, boolean hex) {
this.itemSeparator = itemSeparator;
this.places = places;
this.hex = hex;
}
public boolean add(Object source, Object value) {
boolean result = false;
Object repSource = valueToRepresentativeSource.get(value);
Set equivSet = (Set)sourceToEquiv.get(source);
Set fm = (Set)forcedMerge.get(source);
if (fm == null) {
fm = new TreeSet();
forcedMerge.put(source, fm);
}
if (DEBUG) System.out.println("+Source " + source
+ ", value: " + value);
if (repSource == null && equivSet == null) {
equivSet = new HashSet();
equivSet.add(source);
sourceToEquiv.put(source, equivSet);
valueToRepresentativeSource.put(value, source);
repSource = source; // for debugging
} else if (equivSet == null) {
equivSet = (Set) sourceToEquiv.get(repSource);
equivSet.add(source);
sourceToEquiv.put(source, equivSet);
result = true;
} else if (repSource == null) {
valueToRepresentativeSource.put(value, source);
repSource = source; // for debugging;
} else { // both non-null
Set repEquiv = (Set) sourceToEquiv.get(repSource);
if (!repEquiv.equals(equivSet)) {
result = true;
if (DEBUG) System.out.println("Merging (" + repSource + ") " + toString(repEquiv)
+ " + (" + source + ") " + toString(equivSet));
// merge!!
// put all items from equivSet into repEquiv
repEquiv.addAll(equivSet);
// now add the values to the forced sets
Iterator it = repEquiv.iterator();
while (it.hasNext()) {
Object n = it.next();
fm = (Set)forcedMerge.get(n);
fm.add(value);
}
// then replace all instances for equivSet by repEquiv
// we have to do this in two steps, since iterators are invalidated by changes
Set toReplace = new HashSet();
it = sourceToEquiv.keySet().iterator();
while (it.hasNext()) {
Object otherSource = it.next();
Set otherSet = (Set) sourceToEquiv.get(otherSource);
if (otherSet == equivSet) {
toReplace.add(otherSource);
}
}
it = toReplace.iterator();
while (it.hasNext()) {
Object otherSource = it.next();
sourceToEquiv.put(otherSource,repEquiv);
}
equivSet = repEquiv; // for debugging
}
}
if (DEBUG) System.out.println("--- repSource: " + repSource
+ ", equivSet: " + equivSet);
return result;
}
public String toString () {
StringBuffer result = new StringBuffer();
// make a set to skip duplicates
Iterator it = new HashSet(sourceToEquiv.values()).iterator();
while (it.hasNext()) {
toString((Set)it.next(), result, forcedMerge);
}
return result.toString();
}
private String toString(Object s) {
if (s == null) return "null";
if (s instanceof Collection) {
StringBuffer sb = new StringBuffer();
toString((Collection)s, sb, null);
return sb.toString();
}
if (hex && s instanceof Number) {
return Utility.hex(s, places);
}
return s.toString();
}
private void toString(Collection s, StringBuffer sb, Map valueToRep) {
if (sb.length() != 0) sb.append(itemSeparator);
if (s == null) {
sb.append("{}");
return;
}
sb.append('{');
Iterator it = s.iterator();
boolean notFirst = false;
while (it.hasNext()) {
if (notFirst) sb.append(", ");
notFirst = true;
Object n = it.next();
sb.append(toString(n));
/*if (valueToRep != null) {
sb.append("(" + toString(valueToRep.get(n)) + ")");
}*/
}
sb.append('}');
}
}

View file

@ -0,0 +1,113 @@
package com.ibm.text.utility;
import java.io.*;
/**
 * A {@link Writer} that collects written items into lines, separating items
 * on the same line with a separator, prefixing each line with an indent of
 * spaces, and starting a new line when an item would make the line exceed
 * the configured width.
 *
 * <p>The indent used for a line is the indent that is current when the first
 * item is written into an empty buffer. A line started because the previous
 * one overflowed keeps the indent of that previous line.
 */
public class IndentWriter extends Writer {
    /** Platform-specific line terminator. */
    private static final String EOL = System.getProperty("line.separator");

    private Writer writer;
    /** Content of the line being assembled (without indent). */
    private StringBuffer buffer = new StringBuffer(200);
    /** Maximum line width, counting indent, content and separators. */
    private int width;
    /** Indent that will apply to the next line that is started. */
    private int indent;
    /** Indent of the line currently held in {@code buffer}. */
    private int bufferIndent;
    /** Text inserted between two items on the same line. */
    private String separator;

    /** Wraps {@code writer}; width defaults to 30000 and the separator to a single space. */
    public IndentWriter(Writer writer) {
        this.writer = writer;
        this.width = 30000;
        this.separator = " ";
    }

    /**
     * Wraps an output stream using the named character encoding.
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public IndentWriter(OutputStream writer, String encoding)
            throws UnsupportedEncodingException {
        this.writer = new OutputStreamWriter(writer, encoding);
        this.width = 30000;
        this.separator = " ";
    }

    public void setSeparator(String separator) {
        this.separator = separator;
    }

    public String getSeparator() {
        return separator;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    public int getWidth() {
        return width;
    }

    /** Changes the indent by {@code indentDelta} and flushes the pending line. */
    public void indentBy(int indentDelta) throws IOException {
        this.indent += indentDelta;
        flush();
    }

    /** Sets the indent for subsequently started lines; does not flush. */
    public void setIndent(int indent) {
        this.indent = indent;
    }

    public int getIndent() {
        return indent;
    }

    /** Sets the indent, then writes {@code string} as one item. */
    public void write(int indent, String string) throws IOException {
        setIndent(indent);
        write(string, 0, string.length());
    }

    /** Sets the indent, writes {@code string} as one item and ends the line. */
    public void writeln(int indent, String string) throws IOException {
        write(indent, string);
        flushLine();
    }

    /** Writes {@code string} as one item and ends the line. */
    public void writeln(String string) throws IOException {
        write(string);
        flushLine();
    }

    /** Ends the current line, if it has any content. */
    public void writeln() throws IOException {
        flushLine();
    }

    /**
     * Adds one item to the current line. If the buffer is empty the line
     * takes the current indent; if the item would not fit within the width
     * the pending line is written out first; otherwise the separator is
     * inserted before the item.
     */
    public void write(char cbuf[], int off, int len) throws IOException {
        if (buffer.length() == 0) {
            bufferIndent = indent;
        } else if (bufferIndent + buffer.length() + separator.length() + len > width) {
            flushLine();
        } else {
            buffer.append(separator);
        }
        buffer.append(cbuf, off, len);
    }

    /**
     * Writes the pending line (indent, content, line terminator) to the
     * underlying writer and empties the buffer. Does nothing when the buffer
     * is empty.
     */
    public void flushLine() throws IOException {
        if (buffer.length() != 0) {
            // emit the indent one space at a time so that any indent size works;
            // a non-positive indent produces no leading spaces
            for (int i = 0; i < bufferIndent; ++i) {
                writer.write(' ');
            }
            writer.write(buffer.toString());
            writer.write(EOL);
            buffer.setLength(0);
        }
    }

    /** Writes out the pending line and flushes the underlying writer. */
    public void flush() throws IOException {
        flushLine();
        writer.flush();
    }

    /** Flushes, then closes the underlying writer. */
    public void close() throws IOException {
        flush();
        writer.close();
    }
}

View file

@ -0,0 +1,41 @@
package com.ibm.text.utility;
// =============================================================
// Simple stack mechanism, with push, pop and access
// =============================================================
/**
 * Growable stack of ints with push, pop and indexed read access.
 */
public final class IntStack {
    /** Backing storage; only the first {@code top} slots hold live values. */
    private int[] values;
    /** Number of values currently on the stack (index of the next free slot). */
    private int top = 0;

    /**
     * @param initialSize initial capacity; may be 0, the stack grows on demand
     */
    public IntStack(int initialSize) {
        values = new int[initialSize];
    }

    /** Pushes a value, doubling the capacity when the backing array is full. */
    public void push(int value) {
        if (top >= values.length) { // must grow?
            // doubling a zero-length array would stay at zero, so grow to at least one slot
            int[] grown = new int[Math.max(1, values.length * 2)];
            System.arraycopy(values, 0, grown, 0, values.length);
            values = grown;
        }
        values[top++] = value;
    }

    /**
     * Removes and returns the most recently pushed value.
     * @throws IllegalArgumentException if the stack is empty
     */
    public int pop() {
        if (top > 0) return values[--top];
        throw new IllegalArgumentException("Stack underflow");
    }

    /**
     * Returns the value at {@code index}, where 0 is the bottom of the stack.
     * @throws IllegalArgumentException if index is not in [0, getTop())
     */
    public int get(int index) {
        if (0 <= index && index < top) return values[index];
        throw new IllegalArgumentException("Stack index out of bounds");
    }

    /** @return the number of values on the stack */
    public int getTop() {
        return top;
    }

    /** @return true if no values are on the stack */
    public boolean isEmpty() {
        return top == 0;
    }
}

View file

@ -0,0 +1,13 @@
package com.ibm.text.utility;
import java.util.*;
/**
 * Orders strings by length first (shorter strings sort earlier) and falls
 * back to {@link String#compareTo} for strings of equal length.
 * Both arguments must be Strings.
 */
public final class LengthFirstComparator implements Comparator {
    public int compare(Object a, Object b) {
        final String left = (String) a;
        final String right = (String) b;
        final int leftLength = left.length();
        final int rightLength = right.length();
        if (leftLength != rightLength) {
            return leftLength < rightLength ? -1 : 1;
        }
        // same length: ordinary string order decides
        return left.compareTo(right);
    }
}

View file

@ -0,0 +1,31 @@
package com.ibm.text.utility;
/**
 * An ordered pair of Comparable values. Pairs are compared by their first
 * component and, when those are equal, by their second component.
 *
 * <p>{@code equals} and {@code hashCode} accept null components;
 * {@code compareTo} requires the components of this pair to be non-null.
 */
public final class Pair implements java.lang.Comparable {
    public Comparable first, second;

    public Pair (Comparable first, Comparable second) {
        this.first = first;
        this.second = second;
    }

    /** Combines the component hashes; a null component contributes 0. */
    public int hashCode() {
        return hash(first) * 37 + hash(second);
    }

    /**
     * @return true iff {@code other} is a Pair whose components are equal to
     * this pair's components (two null components count as equal)
     */
    public boolean equals(Object other) {
        // explicit type test instead of catching ClassCastException/NullPointerException
        if (!(other instanceof Pair)) return false;
        Pair that = (Pair) other;
        return same(first, that.first) && same(second, that.second);
    }

    /**
     * Compares first components, then second components.
     * @throws ClassCastException if {@code other} is not a Pair
     * @throws NullPointerException if {@code other} or a component of this pair is null
     */
    public int compareTo(Object other) {
        Pair that = (Pair)other;
        int trial = first.compareTo(that.first);
        if (trial != 0) return trial;
        return second.compareTo(that.second);
    }

    /** Null-safe hashCode. */
    private static int hash(Object o) {
        return o == null ? 0 : o.hashCode();
    }

    /** Null-safe equals. */
    private static boolean same(Object a, Object b) {
        return a == null ? b == null : a.equals(b);
    }
}

View file

@ -0,0 +1,8 @@
package com.ibm.text.utility;
/**
 * Convenience entry points layered on top of {@code UTF32}.
 */
public class UTF16Plus {
    /**
     * Returns the code point found at a UTF-16 offset of a StringBuffer by
     * delegating to {@code UTF32.char32At(StringBuffer, int)}.
     * @param source text to read from
     * @param offset16 UTF-16 offset into {@code source}
     * @return the value produced by {@code UTF32.char32At} for that offset
     */
    public static int charAt(StringBuffer source, int offset16) {
        final int codePoint = UTF32.char32At(source, offset16);
        return codePoint;
    }
}

View file

@ -0,0 +1,718 @@
package com.ibm.text.utility;
/**
* Utility class for demonstrating UTF16 character conversions and indexing conversions.
* Ideally, these methods would be on existing classes in Java, but they can also be used
* in a stand-alone utility class like this one.
* <p>Code that uses strings alone rarely need modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe operation.
* Similarly, concatenation is always safe. Substringing is safe if the start and end are both
* on UTF32 boundaries. In normal code, the values for start and end are on those boundaries,
* since they arose from operations like searching.
* If not, the nearest UTF-32 boundaries can be determined using <code>bounds32()</code>.
* <p>Here is a summary of the methods:
* <ul><li>
* <code>char32At()</code>, <code>count32()</code>, and <code>append32()</code>
* are most important methods for most programs.
* They are used for iteration, filtering and copying. See the examples below.
* </li><li>
* <code>bounds32()</code> is useful for finding the nearest UTF-32 boundaries.
* However, in most circumstances it is better to use
* <a href="http://java.sun.com/products/jdk/1.2/docs/api/java/text/BreakIterator.html#getCharacterInstance(java.util.Locale)">
* BreakIterator.getCharacterInstance(Locale)</a> to find character boundaries
* that are closer to end-user expectations.
* </li><li>
* <code>valueOf32()</code> is occasionally convenient for producing a string containing a UTF-32 value.
* </li><li>
* <code>findOffset16()</code> and <code>findOffset32()</code> are generally not needed,
* except when interfacing to specifications that use UTF-32 indices (such as XSL).
* </li><li>
* <code>isLegal()</code> can be used to test whether UTF-16 or UTF-32 values are valid.
* </li><li>
* <code>isLeadSurrogate()</code>, <code>isSurrogate()</code>, and <code>isTrailSurrogate()</code>
* test the type of a char. They are useful for lower-level code.
* </li><li>
* <code>getChar32()</code>, <code>getLead()</code>, and <code>getTrail()</code>
* are sometimes useful for putting together and taking apart UTF-32 values.
* </li></ul>
* <strong>Examples:</strong>
* <p>The following examples illustrate use of some of these methods.
<pre>
// iteration forwards: Original
for (int i = 0; i < s.length(); ++i) {
    char ch = s.charAt(i);
    doSomethingWith(ch);
}
// iteration forwards: Changes for UTF-32
int ch;
for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) {
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
}
// iteration backwards: Original
for (int i = s.length()-1; i >= 0; --i) {
    char ch = s.charAt(i);
    doSomethingWith(ch);
}
// iteration backwards: Changes for UTF-32
int ch;
for (int i = s.length()-1; i >= 0; i-=UTF32.count16(ch)) {
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
}
* </pre>
* <strong>Notes:</strong>
* <ul><li>
* <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code> and <code>Trail</code> in the API,
* which gives a better sense of their ordering in a string. <code>offset16</code> and <code>offset32</code> are used to distinguish
* offsets to UTF-16 boundaries vs offsets to UTF-32 boundaries.
* <code>int char32</code> is used to contain UTF-32 characters, as opposed to <code>char</code>, which is a UTF-16 code unit.
* </li><li>
* <strong>Roundtripping Offsets:</strong> You can always roundtrip
* from a UTF-32 offset to a UTF-16 offset and back.
* Because of the difference in structure, you can roundtrip
* from a UTF-16 offset to a UTF-32 offset and back if and only if <code>bounds32(string, offset16) != TRAIL</code>.
* </li><li>
* <strong>Exceptions:</strong> The error checking will throw an exception if indices are out of bounds.
* Other than that, all methods will behave reasonably,
* even if unmatched surrogates or out-of-bounds UTF-32 values are present.
* <code>isLegal()</code> can be used to check for validity if desired.
* </li><li>
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then these are
* counted as one UTF-32 value. This matches their iteration behavior, which is vital.
* It also matches common display practice as
* missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* </li><li>
* <strong>Out-of-bounds UTF-32 values:</strong> If a <code>char32</code> contains an out-of-bounds UTF-32 value,
* then it is treated as REPLACEMENT_CHAR for consistency across the API.
* </li><li>
* <strong>Optimization:</strong> The method implementations may need optimization if the compiler doesn't fold static final methods.
* Since surrogate pairs will form an exceeding small percentage of all the text in the world,
* the singleton case should always be optimized for.
* </li></ul>
* @author Mark Davis, with help from Markus Scherer
*/
public final class UTF32 {

    // =========================================================
    // UTILITIES
    // =========================================================

    /**
     * Unicode value used when translating into Unicode encoding form
     * and there is no existing character.
     */
    public static final char REPLACEMENT_CHAR = '\uFFFD';

    /**
     * Values returned by <code>bounds32()</code>.
     */
    public static final int SINGLE = 1, LEAD = 2, TRAIL = 5;

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>isLegal()</code>
     * on char32 before calling.
     * <p><i>If this were integrated into the Java API, it could be a static method of either Character or String.</i>
     * @return 2 if char32 is at or above the first supplementary code point, otherwise 1.
     * @param char32 the input character.
     */
    public static int count16(int char32) {
        if (char32 < MIN_SUPPLEMENTARY) return 1;
        return 2;
    }

    /**
     * Extract a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (with <code>count16()</code>), as well as random access.
     * If a validity check is required, use <code>isLegal()</code> on the return value.
     * <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
     * @return UTF-32 value for the code point that contains the char at offset16;
     * an unmatched surrogate is returned as is.
     * The boundaries of that codepoint are the same as in <code>bounds32()</code>.
     * @param source UTF-16 text
     * @param offset16 UTF-16 offset of either half of the character.
     * @exception StringIndexOutOfBoundsException if offset16 is out of bounds.
     */
    public static int char32At(String source, int offset16) {
        char single = source.charAt(offset16);
        if (!isSurrogate(single)) return single;
        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is low,
        // look both directions.
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) return combine(single, trail);
            }
        } else { // isTrailSurrogate(single), so
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) return combine(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * StringBuffer variant of <code>char32At(String, int)</code>.
     * @param source UTF-16 text
     * @param offset16 UTF-16 offset of either half of the character.
     * @return the code point containing the char at offset16, or the unmatched surrogate itself.
     */
    public static int char32At(StringBuffer source, int offset16) {
        char single = source.charAt(offset16);
        if (!isSurrogate(single)) return single;
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) return combine(single, trail);
            }
        } else { // isTrailSurrogate(single), so
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) return combine(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Array variant of <code>char32At(String, int)</code>, restricted to the
     * window [start16, end16). Chars outside the window are never combined
     * with a surrogate inside it.
     * @param source UTF-16 chars
     * @param start16 first offset of the window (inclusive)
     * @param end16 end of the window (exclusive)
     * @param offset16 UTF-16 offset of either half of the character.
     * @return the code point containing the char at offset16, or the unmatched surrogate itself.
     * @exception ArrayIndexOutOfBoundsException if offset16 is outside [start16, end16).
     */
    public static int char32At(char[] source, int start16, int end16, int offset16) {
        if (offset16 < start16 || offset16 >= end16) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }
        char single = source[offset16];
        if (!isSurrogate(single)) return single;
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < end16 && next < source.length) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) return combine(single, trail);
            }
        } else { // isTrailSurrogate(single), so
            int previous = offset16 - 1;
            // do not look below the start of the window
            if (previous >= start16 && previous >= 0) {
                char lead = source[previous];
                if (isLeadSurrogate(lead)) return combine(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Returns the substring holding the whole code point that contains the
     * char at offset16: moral equivalent of valueOf32(char32At(s, offset16)).
     */
    public static String getCodePointSubstring(String s, int offset16) {
        switch (bounds32(s, offset16)) {
            default: return s.substring(offset16, offset16 + 1);
            case LEAD: return s.substring(offset16, offset16 + 2);
            case TRAIL: return s.substring(offset16 - 1, offset16 + 1);
        }
    }

    /**
     * StringBuffer variant of <code>getCodePointSubstring(String, int)</code>.
     */
    public static String getCodePointSubstring(StringBuffer s, int offset16) {
        switch (bounds32(s, offset16)) {
            default: return s.substring(offset16, offset16 + 1);
            case LEAD: return s.substring(offset16, offset16 + 2);
            case TRAIL: return s.substring(offset16 - 1, offset16 + 1);
        }
    }

    /**
     * Writes the UTF-16 form of cp into output starting at oPosition.
     * @param output destination array
     * @param oPosition offset at which to start writing
     * @param oEnd end (exclusive) of the writable region
     * @param cp code point to write
     * @return the offset just past the last char written
     * @exception ArrayIndexOutOfBoundsException if the writable region is too small
     */
    public static int append32(char[] output, int oPosition, int oEnd, int cp) {
        if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
        output[oPosition++] = UTF32.getLead(cp);
        if (UTF32.count16(cp) != 1) {
            if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
            output[oPosition++] = UTF32.getTrail(cp);
        }
        return oPosition;
    }

    /**
     * Replaces the code point that contains the char at position with codePoint.
     * The buffer grows or shrinks by one char when the old and new code points
     * need a different number of chars.
     * @param b buffer to modify
     * @param position UTF-16 offset of either half of the code point to replace
     * @param codePoint replacement; if out of bounds, REPLACEMENT_CHAR is substituted
     */
    public static void setChar32At(StringBuffer b, int position, int codePoint) {
        // treat out-of-bounds values like the rest of the API does
        if (codePoint < 0 || codePoint > MAX_UNICODE) codePoint = REPLACEMENT_CHAR;
        int type = bounds32(b, position);
        // handle simple cases: #chars at position match #chars in codePoint
        int end = position; // offset of the last char of the code point being replaced
        switch (type) {
            case SINGLE:
                if (isSupplementary(codePoint)) break;
                b.setCharAt(position, (char) codePoint);
                return;
            case LEAD:
                if (!isSupplementary(codePoint)) {
                    ++end;
                    break;
                }
                b.setCharAt(position++, getLead(codePoint));
                b.setCharAt(position, getTrail(codePoint));
                return;
            case TRAIL:
                // position points at the trail; the pair starts one char earlier
                --position;
                if (!isSupplementary(codePoint)) break;
                b.setCharAt(position++, getLead(codePoint));
                b.setCharAt(position, getTrail(codePoint));
                return;
        }
        // mismatch, just use long form
        b.replace(position, end + 1, valueOf32(codePoint));
    }

    /**
     * See if a char value is legal. It can't be:
     * <ul><li>Not-a-character (either U+FFFF or U+FFFE).
     * The datatype char itself prevents out of bounds errors.
     * </li></ul>
     * Note: legal does not mean that it is assigned in this version of Unicode.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @param char16 UTF-16 code unit to test
     * @return true iff legal.
     */
    public static boolean isLegal(char char16) {
        return (char16 < 0xFFFE);
    }

    /**
     * See if a UTF32 value is legal. It can't be:
     * <ul>
     * <li>Out of bounds (less than 0 or greater than MAX_UNICODE)</li>
     * <li>Not-a-character (of the form xxFFFF or xxFFFE)</li>
     * </ul>
     * Surrogate values (00D800 to 00DFFF) are currently accepted: the
     * surrogate-range test is not performed by this method.
     * Note: legal does not mean that it is assigned in this version of Unicode.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @param char32 UTF-32 value to test
     * @return true iff legal.
     */
    public static boolean isLegal(int char32) {
        if (char32 < 0) return false;
        if ((char32 & PLANE_MASK) >= NON_CHARACTER_BASE) return false;
        return (char32 <= MAX_UNICODE);
    }

    /**
     * Determines whether the code unit OR code point is a surrogate.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return true iff the input character is a surrogate.
     * @param char32 the input character.
     */
    public static boolean isSurrogate(int char32) {
        return (SURROGATE_BASE <= char32 && char32 < SURROGATE_LIMIT);
    }

    /**
     * Determines whether the code point is a supplementary.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return true iff the input is in the range 0x10000 to 0x10FFFF.
     * @param char32 the input character.
     */
    public static boolean isSupplementary(int char32) {
        return (char32 >= MIN_SUPPLEMENTARY && char32 <= MAX_UNICODE);
    }

    /**
     * Determines whether the code point is a basic (BMP) code point.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return true iff the input is in the range 0 to 0xFFFF.
     * @param char32 the input character.
     */
    public static boolean isBasic(int char32) {
        return (char32 >= 0 && char32 < MIN_SUPPLEMENTARY);
    }

    /**
     * Determines whether the character is a trail surrogate.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return true iff the input character is a trail surrogate.
     * @param ch the input character.
     */
    public static boolean isTrailSurrogate(char ch) {
        return (TRAIL_BASE <= ch && ch < TRAIL_LIMIT);
    }

    /**
     * Determines whether the character is a lead surrogate.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return true iff the input character is a lead surrogate.
     * @param ch the input character.
     */
    public static boolean isLeadSurrogate(char ch) {
        return (LEAD_BASE <= ch && ch < LEAD_LIMIT);
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use <code>isLegal()</code> on char32 before calling.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return lead surrogate if the count16(char32) is 2;
     * <br>otherwise the character itself
     * @param char32 the input character.
     */
    public static char getLead(int char32) {
        if (char32 >= MIN_SUPPLEMENTARY) {
            return (char) (LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT));
        }
        return (char) char32;
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use <code>isLegal()</code> on char32 before calling.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return the trail surrogate if the count16(char32) is 2;
     * <br>and 0 otherwise (note: 0 is not a valid trail surrogate).
     * @param char32 the input character.
     */
    public static char getTrail(int char32) {
        if (char32 >= MIN_SUPPLEMENTARY) {
            return (char) (TRAIL_BASE + (char32 & TRAIL_MASK));
        }
        return '\u0000';
    }

    /**
     * Convenience method corresponding to String.valueOf(char). It returns a one or two char string containing
     * the UTF-32 value. If the input value can't be converted, it substitutes REPLACEMENT_CHAR.
     * If a validity check is required, use <code>isLegal()</code> before calling.
     * <p><i>If this were integrated into the Java API, it could be a static method of String.</i>
     * @return string value of char32
     * @param char32 the input character.
     */
    public static String valueOf32(int char32) {
        if (char32 < 0 || MAX_UNICODE < char32) return String.valueOf(REPLACEMENT_CHAR);
        if (char32 < MIN_SUPPLEMENTARY) return String.valueOf((char) char32);
        // a local two-char array avoids sharing (and locking) a static buffer
        char[] pair = { getLead(char32), getTrail(char32) };
        return new String(pair);
    }

    /**
     * Returns the UTF-32 character corresponding to the two chars.
     * If a validity check is required, check the arguments with
     * <code>isLeadSurrogate()</code> and <code>isTrailSurrogate()</code>, respectively before calling.
     * <p><i>If this were integrated into the Java API, it could be a static method of String or Character.</i>
     * @return the UTF-32 character, or REPLACEMENT_CHAR if invalid.
     * @param lead the lead char
     * @param trail the trail char
     */
    public static int getChar32(char lead, char trail) {
        if (isLeadSurrogate(lead) && isTrailSurrogate(trail)) {
            return combine(lead, trail);
        }
        return REPLACEMENT_CHAR;
    }

    /**
     * Returns the type of the UTF32 boundaries around the char at offset16.
     * Used for random access.
     * <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
     * @return SINGLE, LEAD, or TRAIL:
     * <ul><li>
     * SINGLE: a single char; the bounds are [offset16, offset16+1]
     * </li><li>
     * LEAD: a surrogate pair starting at offset16; the bounds are [offset16, offset16+2]
     * </li><li>
     * TRAIL: a surrogate pair starting at offset16-1; the bounds are [offset16-1, offset16+1]
     * </ul>
     * For bit-twiddlers, the return values for these are chosen so that the boundaries can be gotten by:
     * [offset16 - (value&gt;&gt;2), offset16 + (value&amp;3)].
     * @param source text to analyse
     * @param offset16 UTF-16 offset
     * @exception StringIndexOutOfBoundsException if offset16 is out of bounds.
     */
    public static int bounds32(String source, int offset16) {
        char ch = source.charAt(offset16);
        if (isSurrogate(ch)) {
            if (isLeadSurrogate(ch)) {
                if (++offset16 < source.length()
                        && isTrailSurrogate(source.charAt(offset16))) return LEAD;
            } else { // isTrailSurrogate(ch), so
                if (--offset16 >= 0
                        && isLeadSurrogate(source.charAt(offset16))) return TRAIL;
            }
        }
        return SINGLE;
    }

    /**
     * StringBuffer variant of <code>bounds32(String, int)</code>.
     */
    public static int bounds32(StringBuffer source, int offset16) {
        char ch = source.charAt(offset16);
        if (isSurrogate(ch)) {
            if (isLeadSurrogate(ch)) {
                if (++offset16 < source.length()
                        && isTrailSurrogate(source.charAt(offset16))) return LEAD;
            } else { // isTrailSurrogate(ch), so
                if (--offset16 >= 0
                        && isLeadSurrogate(source.charAt(offset16))) return TRAIL;
            }
        }
        return SINGLE;
    }

    /**
     * Array variant of <code>bounds32(String, int)</code>, restricted to the
     * window [oStart, oEnd). (Should be renamed bounds.)
     * @exception ArrayIndexOutOfBoundsException if offset16 is outside [oStart, oEnd).
     */
    public static int bounds32(char[] source, int oStart, int oEnd, int offset16) {
        if (offset16 < oStart || offset16 >= oEnd) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }
        char ch = source[offset16];
        if (isSurrogate(ch)) {
            if (isLeadSurrogate(ch)) {
                if (++offset16 < oEnd
                        && isTrailSurrogate(source[offset16])) return LEAD;
            } else { // isTrailSurrogate(ch), so
                if (--offset16 >= oStart
                        && isLeadSurrogate(source[offset16])) return TRAIL;
            }
        }
        return SINGLE;
    }

    /**
     * Returns the UTF-16 offset that corresponds to a UTF-32 offset.
     * Used for random access. See the class description
     * for notes on roundtripping.
     * <p><i>If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.</i>
     * @return UTF-16 offset
     * @param offset32 UTF-32 offset
     * @param source text to analyse
     * @exception StringIndexOutOfBoundsException if offset32 is out of bounds.
     */
    public static int findOffset16(String source, int offset32) {
        int remaining = offset32; // for decrementing
        boolean hadLeadSurrogate = false;
        int i;
        for (i = 0; remaining > 0 && i < source.length(); ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(ch)) {
                hadLeadSurrogate = false; // count valid trail as zero
            } else {
                hadLeadSurrogate = isLeadSurrogate(ch);
                --remaining; // count others as 1
            }
        }
        // if we didn't use up all of remaining (or if we started < 0)
        // then it is beyond the bounds
        if (remaining != 0) throw new StringIndexOutOfBoundsException(offset32);
        // special check for last surrogate if needed, for consistency with
        // other situations
        if (hadLeadSurrogate && i < source.length() && isTrailSurrogate(source.charAt(i))) {
            ++i; // grab extra unicode
        }
        return i;
    }

    /**
     * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given UTF-16 offset.
     * Used for random access. See the class description
     * for notes on roundtripping.
     * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then
     * the UTF-32 offset of the <strong>end</strong> of the pair is returned.</i>
     * <p>To find the UTF-32 length of a string, use:
     * <pre>
     * len32 = findOffset32(source, source.length());
     * </pre>
     * <p><i>If this were integrated into the Java API, it could be a methods of String, StringBuffer and possibly CharacterIterator.</i>
     * @return UTF-32 offset; 0 when offset16 is zero or negative
     * @param source text to analyse
     * @param offset16 UTF-16 offset
     * @exception StringIndexOutOfBoundsException if offset16 is greater than the length of source.
     */
    public static int findOffset32(String source, int offset16) {
        int result = 0;
        boolean hadLeadSurrogate = false;
        for (int i = 0; i < offset16; ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(ch)) {
                hadLeadSurrogate = false; // count valid trail as zero
            } else {
                hadLeadSurrogate = isLeadSurrogate(ch);
                ++result; // count others as 1
            }
        }
        return result;
    }

    /**
     * @return the number of UTF-32 values in source, counting each unmatched surrogate as one.
     */
    public static int length32(String source) {
        return findOffset32(source, source.length());
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * If a validity check is required, use <code>isLegal()</code> on char32 before calling.
     * <p><i>If this were integrated into the Java API, it could be a method of StringBuffer.</i>
     * @param char32 value to append. If out of bounds, substitutes REPLACEMENT_CHAR.
     * @param target string to add to
     */
    public static void append32(StringBuffer target, int char32) {
        // Check for irregular values
        if (char32 < 0 || char32 > MAX_UNICODE) char32 = REPLACEMENT_CHAR;
        // Write the UTF-16 values
        if (char32 >= MIN_SUPPLEMENTARY) {
            target.append((char) (LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT)));
            target.append((char) (TRAIL_BASE + (char32 & TRAIL_MASK)));
        } else {
            target.append((char) char32);
        }
    }

    /**
     * Compare strings using Unicode code point order, instead of UTF-16 code unit order.
     */
    public static final class StringComparator implements java.util.Comparator {
        /**
         * Standard String compare. Only one small section is different, marked in the code.
         * A null argument sorts before any non-null argument.
         */
        public int compare(Object a, Object b) {
            if (a == b) {
                return 0;
            }
            if (a == null) {
                return -1;
            } else if (b == null) {
                return 1;
            }
            String sa = (String) a;
            String sb = (String) b;
            int lena = sa.length();
            int lenb = sb.length();
            int len = lena;
            if (len > lenb) len = lenb;
            for (int i = 0; i < len; ++i) {
                char ca = sa.charAt(i);
                char cb = sb.charAt(i);
                if (ca == cb) continue; // skip remap if equal
                // start of only different section
                // Move surrogates (D800..DFFF) above E000..FFFF so that code units
                // compare in code point order.
                if (ca >= 0xD800) { // reshuffle to get right codepoint order
                    ca += (ca < 0xE000) ? 0x2000 : -0x800;
                }
                if (cb >= 0xD800) { // reshuffle to get right codepoint order
                    cb += (cb < 0xE000) ? 0x2000 : -0x800;
                }
                // end of only different section
                if (ca < cb) return -1;
                return 1; // wasn't equal, so return 1
            }
            if (lena < lenb) return -1;
            if (lena > lenb) return 1;
            return 0;
        }
    }

    // ===========================================================
    // PRIVATES
    // ===========================================================

    /**
     * Prevent instance from being created.
     */
    private UTF32() {}

    /**
     * Combines a lead and a trail surrogate into a code point. The lead is
     * widened to int before shifting; shifting it as a char would drop the
     * high bits. Callers are responsible for checking the surrogate types.
     */
    private static int combine(char lead, char trail) {
        return ((int) lead << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
    }

    /**
     * Maximum code point values for UTF-32.
     */
    private static final int MAX_UNICODE = 0x10FFFF;

    /**
     * Maximum values for Basic code points (BMP).
     */
    private static final int MAX_BASIC = 0xFFFF;

    /**
     * Minimum value for Supplementary code points (SMP).
     */
    private static final int MIN_SUPPLEMENTARY = 0x10000;

    /**
     * Used to mask off single plane in checking for NON_CHARACTER
     */
    private static final int PLANE_MASK = 0xFFFF;

    /**
     * Range of non-characters in each plane
     */
    private static final int
        NON_CHARACTER_BASE = 0xFFFE,
        NON_CHARACTER_END = 0xFFFF;

    // useful statics and tables for fast lookup

    /**
     * Bit pattern shared by all surrogates. Not used within this class.
     * NOTE(review): testing X &amp; SURROGATE_MASK == SURROGATE_MASK is not a
     * sufficient surrogate test (for example 0xF800 passes); use
     * <code>isSurrogate()</code> instead.
     */
    static final int SURROGATE_MASK = 0xD800;

    /**
     * Bottom 10 bits for use in surrogates.
     */
    private static final int TRAIL_MASK = 0x3FF;

    /**
     * Shift value for surrogates.
     */
    private static final int SURROGATE_SHIFT = 10;

    /**
     * Lead surrogates go from LEAD_BASE up to LEAD_LIMIT-1.
     */
    private static final int LEAD_BASE = 0xD800, LEAD_LIMIT = 0xDC00;

    /**
     * Trail surrogates go from TRAIL_BASE up to TRAIL_LIMIT-1.
     */
    private static final int TRAIL_BASE = 0xDC00, TRAIL_LIMIT = 0xE000;

    /**
     * Surrogates go from SURROGATE_BASE up to SURROGATE_LIMIT-1.
     */
    private static final int SURROGATE_BASE = 0xD800, SURROGATE_LIMIT = 0xE000;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET = MIN_SUPPLEMENTARY
        - (LEAD_BASE << SURROGATE_SHIFT) - TRAIL_BASE;

    /**
     * Value to add to (char32 &gt;&gt; SURROGATE_SHIFT) to obtain the lead surrogate.
     */
    private static final int LEAD_BASE_OFFSET = LEAD_BASE - (MIN_SUPPLEMENTARY >> SURROGATE_SHIFT);
}

View file

@ -0,0 +1,177 @@
package com.ibm.text.utility;
import java.io.Reader;
import java.io.InputStream;
import java.io.IOException;
/**
 * Utility class that reads UTF-8 from an InputStream and delivers UTF-16 chars.<br>
 * Main purpose is to supplant InputStreamReader(x, "UTF8"), mirroring UTF8StreamWriter.
 * <br>
 * Example of Usage:
 * <pre>
 * Reader in = new BufferedReader(
 *   new UTF8StreamReader(new FileInputStream(fileName), 32*1024));
 * </pre>
 * Malformed input (illegal lead or trail byte, non-shortest form, code point above
 * U+10FFFF, or a surrogate pair encoded as two 3-byte sequences) is reported with an
 * IllegalArgumentException. A multi-byte sequence cut off by end of stream is silently dropped.
 * <br>
 * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
 */
// TODO: Fix case of surrogate pair crossing input buffer boundary
public final class UTF8StreamReader extends Reader {
    private InputStream input;
    /** When true, a lead+trail surrogate pair encoded as two separate 3-byte sequences is rejected. */
    private boolean checkIrregular = true;

    /**
     * @param stream source of UTF-8 bytes
     * @param buffersize size of the internal byte buffer; must be at least 1
     * @throws IllegalArgumentException if buffersize is less than 1
     */
    UTF8StreamReader(InputStream stream, int buffersize) {
        if (buffersize < 1) {
            throw new IllegalArgumentException("UTF8StreamReader buffersize must be >= 1");
        }
        input = stream;
        bBuffer = new byte[buffersize];
    }

    private byte[] bBuffer; // do a bit of buffering ourselves for efficiency
    private int
        bIndex = 0,             // next unread position in bBuffer
        bEnd = 0,               // number of valid bytes in bBuffer (-1 after end of stream)
        bRemaining = 0,         // trail bytes still expected for the current sequence
        currentPoint = 0,       // code point being assembled
        lastPoint,              // previously completed code point (for irregular-pair check)
        shortestFormTest = 0;   // smallest code point legal for the current sequence length
    /** Trail surrogate that did not fit into the caller's buffer on the previous call, or 0. */
    private char cCarry = 0;

    /**
     * Indexed by lead byte: number of trail bytes that must follow, or -1 if the byte
     * can never start a sequence (trail bytes 80-BF, overlong leads C0/C1, and F5-FF).
     * F4 is a legal lead (U+100000..U+10FFFF); values above U+10FFFF are rejected later.
     */
    private static final byte[] BYTES_REMAINING = {
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 0-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 1-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 2-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 3-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 4-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 5-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 6-
         0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, // 7-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 8-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 9-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // A-
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // B-
        -1,-1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1, // C-
         1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1, // D-
         2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2, // E-
         3, 3, 3, 3,  3,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1  // F-
    };

    /**
     * Decodes UTF-8 bytes into cbuf[off .. off+len-1].
     *
     * @return number of chars stored, 0 if len is not positive, or -1 at end of stream
     * @throws IndexOutOfBoundsException if off is negative or off+len exceeds cbuf.length
     * @throws IllegalArgumentException on malformed UTF-8
     * @throws IOException if the underlying stream fails
     */
    public int read(char cbuf[], int off, int len) throws IOException {
        // check input arguments
        if (len <= 0) return 0;
        if (off < 0 || len > cbuf.length - off) {
            throw new IndexOutOfBoundsException(
                "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        }
        int cIndex = off;
        int cEnd = off + len;

        // if we had a low surrogate from the last call, get it first
        if (cCarry != 0) {
            cbuf[cIndex++] = cCarry;
            cCarry = 0;
        }

        // now loop, filling in the output
        while (cIndex < cEnd) {
            // get more bytes if we run out
            if (bIndex >= bEnd) {
                bIndex = 0;
                bEnd = input.read(bBuffer, 0, bBuffer.length);
                if (bEnd < 0) {
                    if (cIndex == off) return -1;
                    return cIndex - off;
                }
                if (bEnd == 0) continue; // nothing delivered; never decode stale buffer contents
            }

            // process the current byte (mask because Java doesn't have unsigned byte)
            int b = bBuffer[bIndex++] & 0xFF;
            switch (bRemaining) {
            // First Byte case
            case 0:
                bRemaining = BYTES_REMAINING[b];
                switch (bRemaining) {
                case 0:
                    cbuf[cIndex++] = (char) (lastPoint = b);
                    break;
                case 1:
                    currentPoint = b & 0x1F;
                    shortestFormTest = 0x80;
                    break;
                case 2:
                    currentPoint = b & 0xF;
                    shortestFormTest = 0x800;
                    break;
                case 3:
                    currentPoint = b & 0x7;
                    shortestFormTest = 0x10000;
                    break;
                default:
                    bRemaining = 0; // leave the decoder in a usable state
                    throw new IllegalArgumentException("illegal lead code unit: " + b);
                }
                break;

            // Trailing bytes
            case 2: case 3:
                b ^= 0x80; // legal trail bytes 80..BF become 00..3F
                if (b > 0x3F) {
                    bRemaining = 0;
                    throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80));
                }
                currentPoint = (currentPoint << 6) | b;
                --bRemaining;
                break;

            // Last trailing byte, time to assemble
            case 1:
                b ^= 0x80;
                if (b > 0x3F) {
                    bRemaining = 0;
                    throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80));
                }
                currentPoint = (currentPoint << 6) | b;
                --bRemaining;

                // we have gotten the code, so check and stash it
                if (currentPoint < shortestFormTest) {
                    throw new IllegalArgumentException("illegal sequence, not shortest form: " + currentPoint);
                }
                // lead surrogates are D800..DBFF; DC00 is already a trail surrogate
                if (checkIrregular && 0xD800 <= lastPoint && lastPoint <= 0xDBFF
                        && 0xDC00 <= currentPoint && currentPoint <= 0xDFFF) {
                    throw new IllegalArgumentException("irregular sequence, surrogate pair: " + currentPoint);
                }
                lastPoint = currentPoint;
                if (currentPoint >= 0x10000) {
                    if (currentPoint > 0x10FFFF) {
                        throw new IllegalArgumentException("illegal code point, too large: " + currentPoint);
                    }
                    currentPoint -= 0x10000;
                    cbuf[cIndex++] = (char)(0xD800 + (currentPoint >> 10));
                    currentPoint = 0xDC00 + (currentPoint & 0x3FF);
                    if (cIndex >= cEnd) {
                        // no room for the trail surrogate: hand it out on the next call
                        cCarry = (char)currentPoint;
                        return cIndex - off;
                    }
                }
                cbuf[cIndex++] = (char)currentPoint;
                currentPoint = 0;
                break;
            }
        }
        return cIndex - off;
    }

    /** Closes the underlying stream. */
    public void close() throws IOException {
        input.close();
    }
}

View file

@ -0,0 +1,147 @@
package com.ibm.text.utility;
import java.io.*;
/**
 * Utility class that writes UTF8.<br>
 * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
 * <br>
 * Example of Usage:
 * <pre>
 * PrintWriter log = new PrintWriter(
 *   new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
 * </pre>
 * A lead surrogate at the end of one write() call is held back and combined with a trail
 * surrogate at the start of the next call. Unpaired surrogates are written as 3-byte sequences.
 * Note that flush() and close() emit a held-back lead surrogate as an unpaired one.
 * <br>
 * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
 */
// TODO: Fix case of surrogate pair crossing input buffer boundary
public final class UTF8StreamWriter extends Writer {
    private OutputStream output;
    private byte[] bBuffer; // do a bit of buffering ourselves for efficiency
    /** Highest buffer index at which a maximal (4-byte) sequence still fits. */
    private int bSafeEnd;
    private int bIndex = 0;
    /** Pending lead surrogate waiting for its trail, or 0 if none. */
    private int highSurrogate = 0;

    /**
     * @param stream destination for the UTF-8 bytes
     * @param buffersize size of the internal byte buffer; must be at least 5
     * @throws IllegalArgumentException if buffersize is less than 5
     */
    public UTF8StreamWriter(OutputStream stream, int buffersize) {
        if (buffersize < 5) {
            throw new IllegalArgumentException("UTF8StreamWriter buffersize must be >= 5");
        }
        output = stream;
        bBuffer = new byte[buffersize];
        bSafeEnd = buffersize - 4;
    }

    // smallest code points needing 2, 3 and 4 bytes: 0x80, 0x800, 0x10000
    private static final int
        NEED_2_BYTES = 1<<7,
        NEED_3_BYTES = 1<<(2*5 + 1),
        NEED_4_BYTES = 1<<(3*5 + 1);
    private static final int
        TRAILING_BOTTOM_MASK = 0x3F,
        TRAILING_TOP = 0x80;
    /** (lead << 10) + trail + MAGIC is the supplementary code point of a surrogate pair. */
    private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);

    /**
     * Encodes buffer[cStart .. cStart+cLength-1] as UTF-8 into the internal buffer,
     * passing full buffers on to the underlying stream.
     */
    public final void write(char[] buffer, int cStart, int cLength) throws IOException {
        int cEnd = cStart + cLength;
        while (cStart < cEnd) {
            // get code point
            int utf32 = buffer[cStart++];

            // special check for surrogates
            if (highSurrogate != 0) {
                if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                    writeCodePoint((highSurrogate << 10) + utf32 + MAGIC);
                    highSurrogate = 0;
                    continue;
                }
                // previous lead was unpaired: emit it on its own, then handle this char.
                // This may write two sequences in one iteration, which is why room is
                // checked inside writeCodePoint rather than once per iteration.
                writeCodePoint(highSurrogate);
                highSurrogate = 0;
            }
            if (0xD800 <= utf32 && utf32 <= 0xDBFF) {
                highSurrogate = utf32;
                continue;
            }

            // normal case
            writeCodePoint(utf32);
        }
    }

    /**
     * Appends the UTF-8 form of one code point to the buffer, first handing the buffer
     * to the stream if a maximal 4-byte sequence might not fit.
     */
    private final void writeCodePoint(int utf32) throws IOException {
        // write if we need to
        if (bIndex > bSafeEnd) {
            output.write(bBuffer, 0, bIndex);
            bIndex = 0;
        }

        // convert to bytes
        if (utf32 < NEED_2_BYTES) {
            bBuffer[bIndex++] = (byte)utf32;
            return;
        }

        // Find out how many bytes we need to write
        // At this point, it is at least 2.
        // Bytes are filled in back to front, peeling 6 bits off utf32 each time.
        int backIndex;
        int firstByteMark;
        if (utf32 < NEED_3_BYTES) {
            backIndex = bIndex += 2;
            firstByteMark = 0xC0;
        } else if (utf32 < NEED_4_BYTES) {
            backIndex = bIndex += 3;
            firstByteMark = 0xE0;
            bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
            utf32 >>= 6;
        } else {
            backIndex = bIndex += 4;
            firstByteMark = 0xF0;
            bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
            utf32 >>= 6;
            bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
            utf32 >>= 6;
        }
        // every multi-byte form ends with: one more trail byte, then the lead byte
        bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK));
        utf32 >>= 6;
        bBuffer[--backIndex] = (byte)(firstByteMark | utf32);
    }

    /** Emits any pending lead surrogate and hands all buffered bytes to the stream. */
    private void internalFlush() throws IOException {
        if (highSurrogate != 0) {
            writeCodePoint(highSurrogate);
            highSurrogate = 0;
        }

        // write buffer if we need to
        if (bIndex != 0) {
            output.write(bBuffer, 0, bIndex);
            bIndex = 0;
        }
    }

    public void close() throws IOException {
        internalFlush();
        output.close();
    }

    public void flush() throws IOException {
        internalFlush();
        output.flush();
    }
}

View file

@ -0,0 +1,443 @@
package com.ibm.text.utility;
import java.util.*;
import java.text.*;
import java.io.*;
public final class Utility {    // COMMON UTILITIES

    static final boolean UTF8 = true; // TODO -- make argument

    /**
     * Returns names[i], or "UNKNOWN" if names is null or i is out of range.
     */
    public static String getName(int i, String[] names) {
        if (names == null || i < 0 || i >= names.length) return "UNKNOWN";
        return names[i];
    }

    private static boolean needCRLF = false;

    /**
     * Progress indicator: prints a '.' to System.out whenever i is a multiple of 0x7FF.
     */
    public static void dot(int i) {
        if ((i % 0x7FF) == 0) {
            needCRLF = true;
            System.out.print('.');
        }
    }

    /**
     * Ends the line of progress dots, if any were printed since the last call.
     */
    public static void fixDot() {
        if (needCRLF) {
            System.out.println();
            needCRLF = false;
        }
    }

    /**
     * Builds a mask with the inclusive bit range between start and end set
     * (the two may be given in either order).
     */
    private static int bitMask(int start, int end) {
        if (start < end) {
            int temp = start;
            start = end;
            end = temp;
        }
        // (1 << 32) wraps around to 1 in Java, so the top bit needs its own case
        int bmstart = (start == 31) ? -1 : (1 << (start+1)) - 1;
        int bmend = (1 << end) - 1;
        return bmstart & ~bmend;
    }

    /**
     * Returns source with bits start..end (inclusive, either order, 0 = lowest) set.
     */
    public static int setBits(int source, int start, int end) {
        return source | bitMask(start, end);
    }

    /** Returns source with the single bit start set. */
    public static int setBit(int source, int start) {
        return setBits(source, start, start);
    }

    /**
     * Returns source with bits start..end (inclusive, either order, 0 = lowest) cleared.
     */
    public static int clearBits(int source, int start, int end) {
        return source & ~bitMask(start, end);
    }

    /** Returns source with the single bit start cleared. */
    public static int clearBit(int source, int start) {
        return clearBits(source, start, start);
    }

    /**
     * Case-insensitive search.
     * @return index of the first element of target equal to source, or -1 if none
     */
    public static int find(String source, String[] target) {
        for (int i = 0; i < target.length; ++i) {
            if (source.equalsIgnoreCase(target[i])) return i;
        }
        return -1;
    }

    /**
     * Like find, but returns the index as a byte and throws if there is no match.
     * @throws ChainException if source is not in target
     */
    public static byte lookup(String source, String[] target) {
        int result = Utility.find(source, target);
        if (result != -1) return (byte)result;
        throw new ChainException("Could not find \"{0}\" in table [{1}]", new Object [] {source, target});
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     * Negative values are rendered as '-' followed by the padded magnitude.
     * @param places minimum number of hex digits; any non-negative width is accepted
     */
    static public String hex(long i, int places) {
        if (i == Long.MIN_VALUE) return "-8000000000000000"; // -i would overflow
        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String result = Long.toString(i, 16).toUpperCase();
        if (result.length() < places) {
            StringBuffer padded = new StringBuffer(places);
            for (int k = result.length(); k < places; ++k) {
                padded.append('0');
            }
            result = padded.append(result).toString();
        }
        if (negative) {
            return '-' + result;
        }
        return result;
    }

    /** Hex with at least 4 digits. */
    public static String hex(long ch) {
        return hex(ch,4);
    }

    /** Hex of each code point of s.toString(), 4 digits minimum, separated by spaces. */
    public static String hex(Object s) {
        return hex(s, 4, " ");
    }

    /** Hex of each code point of s.toString(), separated by spaces. */
    public static String hex(Object s, int places) {
        return hex(s, places, " ");
    }

    /** Hex of each code point of s.toString(), 4 digits minimum. */
    public static String hex(Object s, String separator) {
        return hex(s, 4, separator);
    }

    /**
     * Hex rendering of an arbitrary object: "" for null, the numeric value for a Number,
     * otherwise every code point of o.toString() joined by separator.
     * @param places minimum number of hex digits per value
     */
    public static String hex(Object o, int places, String separator) {
        if (o == null) return "";
        if (o instanceof Number) return hex(((Number)o).longValue(), places);

        String s = o.toString();
        StringBuffer result = new StringBuffer();
        int i = 0;
        while (i < s.length()) {
            if (i != 0) result.append(separator);
            int ch = UTF32.char32At(s, i);
            result.append(hex(ch, places)); // honor the requested width
            i += UTF32.count16(ch);
        }
        return result.toString();
    }

    /** Two-digit hex of the (unsigned) bytes o[start..end-1], separated by spaces. */
    public static String hex(byte[] o, int start, int end) {
        StringBuffer result = new StringBuffer();
        for (int i = start; i < end; ++i) {
            if (i != start) result.append(' '); // separator between items only
            result.append(hex(o[i] & 0xFF, 2));
        }
        return result.toString();
    }

    /** Four-digit hex of the chars o[start..end-1], separated by spaces. */
    public static String hex(char[] o, int start, int end) {
        StringBuffer result = new StringBuffer();
        for (int i = start; i < end; ++i) {
            if (i != start) result.append(' '); // separator between items only
            result.append(hex(o[i], 4));
        }
        return result.toString();
    }

    /** Returns s concatenated count times ("" if count is not positive). */
    public static String repeat(String s, int count) {
        if (count <= 0) return "";
        if (count == 1) return s;
        StringBuffer result = new StringBuffer(count*s.length());
        for (int i = 0; i < count; ++i) {
            result.append(s);
        }
        return result.toString();
    }

    /**
     * Parses a decimal int. An empty string yields the sentinel Short.MIN_VALUE.
     */
    public static int intFrom(String p) {
        if (p.length() == 0) return Short.MIN_VALUE;
        return Integer.parseInt(p);
    }

    /**
     * Parses a float given either as a plain number or as an integer fraction "num/den".
     * An empty string yields NaN; a missing numerator or denominator counts as 0.
     */
    public static float floatFrom(String p) {
        if (p.length() == 0) return Float.NaN;
        int fract = p.indexOf('/');
        if (fract == -1) return Float.valueOf(p).floatValue();
        String q = p.substring(0,fract);
        float num = 0;
        if (q.length() != 0) num = Integer.parseInt(q);
        p = p.substring(fract+1,p.length());
        float den = 0;
        if (p.length() != 0) den = Integer.parseInt(p);
        return num/den;
    }

    /**
     * Parses a hex string that must denote exactly one code point.
     * @throws ChainException if p denotes zero or several code points
     */
    public static int codePointFromHex(String p) {
        String temp = Utility.fromHex(p);
        if (UTF32.length32(temp) != 1) throw new ChainException("String is not single (UTF32) character: " + p, null);
        return UTF32.char32At(temp, 0);
    }

    /**
     * Converts a list of hex code points, separated by spaces, commas or semicolons,
     * into the corresponding string.
     * @throws ChainException on a non-hex character or a value above 10FFFF
     */
    public static String fromHex(String p) {
        StringBuffer output = new StringBuffer();
        int value = 0;
        int count = 0;
        main:
        for (int i = 0; i < p.length(); ++i) {
            char ch = p.charAt(i);
            int digit = 0;
            switch (ch) {
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                    digit = ch - 'a' + 10;
                    break;
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                    digit = ch - 'A' + 10;
                    break;
                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
                case '8': case '9':
                    digit = ch - '0';
                    break;
                default:
                    int type = Character.getType(ch);
                    if (type != Character.SPACE_SEPARATOR) {
                        throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
                            new Object[] {String.valueOf(ch), new Integer(i), p});
                    }
                    // fall through!!
                case ' ': case ',': case ';': // do SPACE here, just for speed
                    if (count != 0) {
                        UTF32.append32(output, value);
                    }
                    count = 0;
                    value = 0;
                    continue main;
            }
            value <<= 4;
            value += digit;
            if (value > 0x10FFFF) {
                throw new ChainException("Character code too large: '{0}' at position {1} in \"{2}\"",
                    new Object[] {String.valueOf(ch), new Integer(i), p});
            }
            count++;
        }
        if (count != 0) {
            UTF32.append32(output, value);
        }
        return output.toString();
    }

    /**
     * Splits s at every occurrence of divider into output, filling unused trailing
     * slots with "". The caller must supply an array with room for every piece.
     * @return the number of pieces found
     */
    public static int split(String s, char divider, String[] output) {
        int last = 0;
        int current = 0;
        int i;
        for (i = 0; i < s.length(); ++i) {
            if (s.charAt(i) == divider) {
                output[current++] = s.substring(last,i);
                last = i+1;
            }
        }
        output[current++] = s.substring(last,i);
        int result = current;
        while (current < output.length) {
            output[current++] = "";
        }
        return result;
    }

    /**
     * Splits s at every occurrence of divider; the result has exactly one entry per piece.
     */
    public static String[] split(String s, char divider) {
        // size the result from the input instead of assuming at most 100 pieces
        int pieces = 1;
        for (int i = 0; i < s.length(); ++i) {
            if (s.charAt(i) == divider) ++pieces;
        }
        String[] result = new String[pieces];
        split(s, divider, result);
        return result;
    }

    /** Returns a copy of source[start..end-1]. */
    public static String[] extract(String[] source, int start, int end) {
        String[] result = new String[end-start];
        System.arraycopy(source, start, result, 0, end - start);
        return result;
    }

    /**
     * Renders s as a double-quoted Java string literal ("null" for a null argument),
     * escaping each char with quoteJava.
     */
    public static String quoteJavaString(String s) {
        if (s == null) return "null";
        StringBuffer result = new StringBuffer();
        result.append('"');
        for (int i = 0; i < s.length(); ++i) {
            result.append(quoteJava(s.charAt(i)));
        }
        result.append('"');
        return result.toString();
    }

    /**
     * Escapes one code point for use inside a Java literal: backslash, quote, CR and LF
     * get their short escapes, printable ASCII is passed through, and everything else
     * becomes one \\uXXXX escape (two for supplementary code points).
     */
    public static String quoteJava(int c) {
        switch (c) {
        case '\\':
            return "\\\\";
        case '"':
            return "\\\"";
        case '\r':
            return "\\r";
        case '\n':
            return "\\n";
        default:
            if (c >= 0x20 && c <= 0x7E) {
                return String.valueOf((char)c);
            } else if (UTF32.isSupplementary(c)) {
                return "\\u" + hex((char)UTF32.getLead(c),4) + "\\u" + hex((char)UTF32.getTrail(c),4);
            } else {
                return "\\u" + hex((char)c,4);
            }
        }
    }

    /**
     * Escapes one code point for XML output. Controls, U+FFFE/U+FFFF and surrogates are
     * rendered as "#x...;" WITHOUT a leading '&': the result is visible text, not a
     * character reference.
     */
    public static String quoteXML(int c) {
        switch (c) {
            case '<': return "&lt;";
            case '>': return "&gt;";
            case '&': return "&amp;";
            case '\'': return "&apos;";
            case '"': return "&quot;";

            // fix controls, since XML can't handle
            // also do this for 09, 0A, and 0D, so we can see them.
            case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
            case 0x08: case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D: case 0x0E: case 0x0F:
            case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
            case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
            case 0x7F:

            // fix noncharacters, since XML can't handle
            case 0xFFFE: case 0xFFFF:

                return "#x" + hex(c,1) + ";";
        }

        // fix surrogates, since XML can't handle
        if (UTF32.isSurrogate(c)) {
            return "#x" + hex(c,1) + ";";
        }

        if (c <= 0x7E || UTF8) {
            return UTF32.valueOf32(c);
        }

        // fix supplementaries & high characters, because of IE bug
        /*if (UTF32.isSupplementary(c) || 0xFFF9 <= c && c <= 0xFFFD) {
            return "#x" + hex(c,1) + ";";
        }
        */

        return "&#x" + hex(c,1) + ";";
    }

    /**
     * Escapes every code point of source with quoteXML(int); "null" for a null argument.
     */
    public static String quoteXML(String source) {
        if (source == null) return "null";
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < source.length(); ++i) {
            int c = UTF32.char32At(source, i);
            if (UTF32.isSupplementary(c)) ++i; // skip the trail surrogate
            result.append(quoteXML(c));
        }
        return result.toString();
    }

    /**
     * Lexicographic comparison of a[aStart..aEnd-1] with b[bStart..bEnd-1] by char value;
     * if one range is a prefix of the other, the shorter one sorts first.
     * @return negative, zero or positive
     */
    public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
        while (aStart < aEnd && bStart < bEnd) {
            int diff = a[aStart++] - b[bStart++];
            if (diff != 0) return diff;
        }
        return (aEnd - aStart) - (bEnd - bStart);
    }

    /**
     * Same as the char[] version, comparing bytes as SIGNED values.
     */
    public static int compare(byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) {
        while (aStart < aEnd && bStart < bEnd) {
            int diff = a[aStart++] - b[bStart++];
            if (diff != 0) return diff;
        }
        return (aEnd - aStart) - (bEnd - bStart);
    }

    /**
     * Same as the char[] version, comparing bytes as UNSIGNED values (0..255).
     */
    public static int compareUnsigned(byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) {
        while (aStart < aEnd && bStart < bEnd) {
            int diff = (a[aStart++] & 0xFF) - (b[bStart++] & 0xFF);
            if (diff != 0) return diff;
        }
        return (aEnd - aStart) - (bEnd - bStart);
    }

    /** Formats array as "{a" + sep + "b" + sep + ... + "}". */
    public static String join(int[] array, String sep) {
        StringBuffer result = new StringBuffer("{");
        for (int i = 0; i < array.length; ++i) {
            if (i != 0) result.append(sep);
            result.append(array[i]);
        }
        return result.append('}').toString();
    }

    /** Formats array as "{a" + sep + "b" + sep + ... + "}". */
    public static String join(long[] array, String sep) {
        StringBuffer result = new StringBuffer("{");
        for (int i = 0; i < array.length; ++i) {
            if (i != 0) result.append(sep);
            result.append(array[i]);
        }
        return result.append('}').toString();
    }

    /** Version directories (each suffixed with "-Update") searched in this order. */
    private static final String[] searchPath = {
        "EXTRAS",
        "3.1.1",
        "3.1.0",
        "3.0.1",
        "3.0.0",
        "2.1.9",
        "2.0.0",
        "1.1.0",
    };

    private static final String DATA_DIR = "C:\\DATA";

    /**
     * Opens a UTF-8 PrintWriter on the file of the given name in DATA_DIR's GEN subdirectory.
     */
    public static PrintWriter openPrintWriter(String filename) throws IOException {
        return new PrintWriter(
                    new UTF8StreamWriter(new FileOutputStream(DATA_DIR + File.separator + "GEN" + File.separator + filename),
                    32*1024));
    }

    /**
     * Finds and opens the first ".txt" file whose name starts with filename, looking through
     * the searchPath directories in order. When version is non-empty, directories whose name
     * compares greater than version are skipped. Directories that do not exist are ignored.
     * @return a reader on the file, or null if no directory contains a match
     */
    public static BufferedReader openUnicodeFile(String filename, String version) throws IOException {
        // get all the files in the directory
        for (int i = 0; i < searchPath.length; ++i) {
            if (version.length() != 0 && version.compareTo(searchPath[i]) < 0) continue;

            String directoryName = DATA_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
            System.out.println("Trying: '" + directoryName + "'");
            File directory = new File(directoryName);
            String[] list = directory.list();
            if (list == null) continue; // not a directory (or unreadable): try the next one
            for (int j = 0; j < list.length; ++j) {
                String fn = list[j];
                if (!fn.endsWith(".txt")) continue;
                //System.out.print("\t'" + fn + "'");
                if (!fn.startsWith(filename)) {
                    //System.out.println(" -- MISS: '" + filename + "'");
                    continue;
                }
                //System.out.println(" -- HIT");
                System.out.println("\tFound: '" + fn + "'");
                return new BufferedReader(new FileReader(directoryName + fn),32*1024);
            }
        }
        return null;
    }
}

View file

@ -0,0 +1,403 @@
package com.ibm.text.utility;
/**
* Very dumb XML parser, designed for restricted environment where transmitter is guaranteed
* to limit types of XML files generated.
*
* RESTRICTIONS
* Requires document to be well-formed. Doesn't properly signal errors if it is not.
* No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA
* No processing instructions
* Does do character references, lt, gt, amp, apos, quot
* The encoding is specified by the user, by using the right Reader
* On creation, you supply a buffer for the textual elements. Use a buffer that is as large
* as the largest possible piece of text (e.g. attribute value or element text) in the file.
*
* @author Mark Davis
*/
import java.io.*;
public final class XMLParse implements XMLParseTypes {

    /** Create a parser reading from an existing Reader.
     * @param stream source of characters; the caller chooses the encoding
     * @param buffer holds the text of each item; must be large enough for the longest one
     */
    public XMLParse(Reader stream, char[] buffer) {
        this.stream = stream;
        this.buffer = buffer;
    }

    /** Create a parser reading the named file with the platform default encoding.
     * Call close() when done to release the file.
     */
    public XMLParse(String fileName, char[] buffer) throws FileNotFoundException {
        stream = new BufferedReader(new FileReader(fileName),32*1024);
        this.buffer = buffer;
    }

    /** Closes the underlying Reader.
     */
    public void close() throws IOException {
        stream.close();
    }

    /** Get the textual value associated with this item.
     * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
     */
    public String getValue() {
        return String.valueOf(buffer, 0, bufferCount);
    }

    /** Get length of the textual value associated with this item.
     * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
     */
    public int getValueCount() {
        return bufferCount;
    }

    /** Get the buffer that was passed in on creation.
     */
    public char[] getValueArray() {
        return buffer;
    }

    /** Get the "kind" of the last item (see XMLParseTypes)
     */
    public int getKind() {
        return kind;
    }

    /** Get the next element, returning a "Kind" (see XMLParseTypes).
     * Returns DONE at end of input. Any exception while reading or while decoding a
     * character reference (including a reference longer than the internal 10-char
     * buffer) is treated as end of input.
     * An item longer than the value buffer raises ArrayIndexOutOfBoundsException.
     */
    public byte next() {
        char c = '\u0000';
        char type = c;
        // 0xFFFF doubles as the end-of-input marker: (char) of read()'s -1
        while (c != 0xFFFF) {
            try {
                // First read the character. If there is a buffered char, use it instead
                if (bufferChar != 0) {
                    c = bufferChar;
                    bufferChar = 0;
                } else {
                    c = (char) stream.read();
                }

                // Now set the right type. Since we assume validity, anything but the syntax chars
                // can be classed as IDENTIFIER
                switch (c) {
                case ' ': case '\r': case '\n': case '\t':
                    type = ' ';
                    break;
                case '<': case '>': case '#': case ';': case '/': case '\'': case '"':
                case '=': case '?': case '!': case '-':
                    type = c;
                    break;
                case '&': // CR, either numerical or lt, gt, quot, amp, apos
                    // gather characters
                    // (at end of input this loop stops only by overrunning crBuffer,
                    // which lands in the catch below)
                    int crCount = 0;
                    while (true) {
                        c = (char) stream.read();
                        if (c == ';') break;
                        crBuffer[crCount++] = c;
                    }
                    // parse it, and break into two pieces if necessary
                    int x = parseCR(crBuffer, crCount);
                    c = (char)x;
                    if (x > 0xFFFF) { // Supplementary
                        x -= 0x10000;
                        c = (char) (0xD800 + (x >> 10));
                        bufferChar = (char) (0xDC00 + (x & 0x3FF));
                    }
                    // Since we assume validity, any CRs are not syntax characters
                    type = IDENTIFIER; // everything else
                    break;
                default:
                    type = IDENTIFIER; // everything else
                    break;
                }
            } catch (Exception e) {
                c = '\uFFFF';
            }

            // We now have a character. Throw it at our little state machine

            if (SHOW) System.out.println(c + ", " + type + ", " + stateNames[state]);

            switch (state) {
            case IN_TEXT:
                if (type == '<') {
                    state = START_ELEMENT;
                    if (bufferCount != 0) {
                        kind = TEXT;
                        return kind;
                    }
                    break;
                }
                buffer[bufferCount++] = c;
                break;
            case START_ELEMENT: // must be either '/' or more than one ID char
                bufferCount = 0;
                switch (type) {
                case '/':
                    elementType = ELEMENT_TAG_SLASH;
                    state = IN_ELEMENT;
                    break;
                case '!':
                    buffer[bufferCount++] = c;
                    elementType = ELEMENT_TAG_COMMENT;
                    state = IN_COMMENT;
                    break;
                case '?':
                    elementType = ELEMENT_TAG_QUESTION;
                    state = IN_ELEMENT;
                    break;
                default:
                    elementType = ELEMENT_TAG;
                    buffer[bufferCount++] = c;
                    state = IN_ELEMENT;
                    break;
                }
                break;
            case IN_COMMENT:
                buffer[bufferCount++] = c;
                if (type == '-') state = IN_COMMENT2;
                else state = IN_COMMENT;
                break;
            case IN_COMMENT2:
                buffer[bufferCount++] = c;
                if (type == '-') state = IN_COMMENT3;
                else state = IN_COMMENT;
                break;
            case IN_COMMENT3:
                if (type == '>') {
                    kind = ELEMENT_TAG_COMMENT;
                    // push the '>' back so the next call reports END_ELEMENT_COMMENT
                    bufferChar = c;
                    state = IN_ATTRIBUTES;
                    elementType = END_ELEMENT_COMMENT;
                    return kind;
                } else if (type != '-') {
                    state = IN_COMMENT;
                }
                buffer[bufferCount++] = c;
                break;
            case IN_ELEMENT:
                if (type != IDENTIFIER) {
                    state = IN_ATTRIBUTES;
                    kind = elementType;
                    elementType = END_ELEMENT;
                    // push the terminator back so IN_ATTRIBUTES sees it
                    bufferChar = c;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;
            case IN_ATTRIBUTES:
                bufferCount = 0;
                if (type == '/') {
                    elementType = END_ELEMENT_SLASH;
                } else if (type == '?') {
                    elementType = END_ELEMENT_QUESTION;
                } else if (type == '>') {
                    state = IN_TEXT;
                    kind = elementType;
                    return kind;
                } else if (type == IDENTIFIER) {
                    state = IN_ATTR;
                    buffer[bufferCount++] = c;
                    break;
                }
                break;
            case IN_ATTR:
                if (type != IDENTIFIER) {
                    state = START_VALUE;
                    kind = ATTRIBUTE_TAG;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;
            case START_VALUE: // must have <s>* = ( ' | " )
                if (type == '\'' || type == '"') {
                    lastQuote = c;
                    state = IN_VALUE;
                    bufferCount = 0;
                }
                break;
            case IN_VALUE: // only terminated by lastQuote
                if (type == lastQuote) {
                    state = IN_ATTRIBUTES;
                    kind = ATTRIBUTE_VALUE;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;
            }
        }
        return DONE;
    }

    /** Utility for doing XML quotes, with no flags set.
     * (see XMLParseTypes for values)
     */
    public static String quote(int c) {
        return quote(c, 0);
    }

    /** Utility for doing XML quotes. Flags control which characters are handled and how.
     * (see XMLParseTypes for values)
     * A code point that needs no quoting is returned as is; a supplementary code
     * point comes back as its surrogate pair.
     */
    public static String quote(int c, int flags) {
        String result = quoteGuts(c, flags);
        if (result != null) return result;
        if (c > 0xFFFF) {
            // a single char cannot hold a supplementary code point
            int x = c - 0x10000;
            return String.valueOf(new char[] {
                (char)(0xD800 + (x >> 10)),
                (char)(0xDC00 + (x & 0x3FF))});
        }
        return String.valueOf((char)c);
    }

    /** Utility for doing XML quotes on a whole string, with no flags set.
     * (see XMLParseTypes for values)
     */
    public static String quote(String source) {
        return quote(source, 0);
    }

    /** Utility for doing XML quotes. Flags control which characters are handled and how.
     * (see XMLParseTypes for values)
     */
    public static String quote(String source, int flags) {
        StringBuffer result = new StringBuffer();
        String temp;
        for (int i = 0; i < source.length(); ++i) {
            int c = UTF32.char32At(source, i);
            if (c > 0xFFFF) ++i; // i now addresses the trail surrogate
            temp = quoteGuts(c, flags);
            if (temp != null) result.append(temp);
            else if (c <= 0xFFFF) result.append((char)c);
            else result.append(source.substring(i-1,i+1)); // surrogates
        }
        return result.toString();
    }

    /** Parses inside of CR. buffer should not contain the initial '&', or final ';'
     * Named references are recognized by their first letter only (plus the second
     * letter to tell amp from apos).
     * @return the code point, or -1 if the reference is empty or unknown
     * @throws NumberFormatException if a numeric reference is malformed
     */
    static int parseCR(char[] crBuffer, int crCount) {
        int c;
        int start = 0;
        if (crCount == 0) return -1;
        switch (crBuffer[start++]) {
        case 'l': c = '<'; break;   // lt
        case 'g': c = '>'; break;   // gt
        case 'q': c = '"'; break;   // quot
        case 'a':                   // &amp;, &apos;
            if (crCount > start && crBuffer[start] == 'm') c = '&';
            else c = '\'';
            break;
        case '#':
            int radix = 10;
            if (crCount > start && crBuffer[start] == 'x') {
                radix = 16;
                ++start;
            }
            // Simple code for now. Could be sped up.
            c = Integer.parseInt(String.valueOf(crBuffer,start,crCount-start), radix);
            break;
        default:
            c = -1;
        }
        return c;
    }

    /** Utility for doing hex, padding with zeros up to the given minimum width
     * (any non-negative width is accepted).
     */
    static public String hex(long i, int places) {
        String result = Long.toString(i, 16).toUpperCase();
        if (result.length() < places) {
            StringBuffer padded = new StringBuffer(places);
            for (int k = result.length(); k < places; ++k) {
                padded.append('0');
            }
            result = padded.append(result).toString();
        }
        return result;
    }

    // =================== PRIVATES =================================

    private static final boolean SHOW = false;

    private char[] buffer;          // caller-supplied storage for the current item's text
    private int bufferCount;        // number of valid chars in buffer
    private byte kind = TEXT;       // kind of the item last returned by next()
    private Reader stream;
    private char[] crBuffer = new char[10]; // body of the character reference being read
    private int state = IN_TEXT;
    private byte elementType;       // kind to report when the current tag or its end is complete
    private char lastQuote;         // quote character that opened the current attribute value
    private char bufferChar;        // one pushed-back char (0 = none)

    private static final byte IN_TEXT = 0, START_ELEMENT = 1, IN_ELEMENT = 2,
        IN_ATTR = 3, START_VALUE = 4, IN_VALUE = 5, IN_ATTRIBUTES = 6,
        IN_COMMENT = 7, IN_COMMENT2 = 8, IN_COMMENT3 = 9;

    private static final String[] stateNames = {"IN_TEXT", "START_ELEMENT", "IN_ELEMENT",
        "IN_ATTR", "START_VALUE", "IN_VALUE", "IN_ATTRIBUTES",
        "IN_COMMENT", "IN_COMMENT2", "IN_COMMENT3"};

    /** Character class used for everything that is not whitespace or XML syntax. */
    private static final char IDENTIFIER = 'a';

    /** Core of quote(): returns the replacement text for c, or null if c is to be
     * copied unchanged. Controls, non-characters and surrogates are rendered without
     * the leading '&', i.e. as visible text rather than as a character reference.
     */
    private static String quoteGuts(int c, int flags) {
        String prefix = "&";
        switch (c) {
            case '<': return "&lt;";
            case '>': return "&gt;";
            case '&': return "&amp;";
            case '\'': return "&apos;";
            case '"': return "&quot;";

            // Optionally fix TAB, CR, LF
            case 0x09: case 0x0A: case 0x0D:
                if ((flags & QUOTE_TABCRLF) == 0) return null;
                break;

            // Fix controls, non-characters, since XML can't handle
            case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
            case 0x08: case 0x0B: case 0x0C: case 0x0E: case 0x0F:
            case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
            case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
            case 0x7F:
            case 0xFFFE: case 0xFFFF:
                prefix = "";
                break;

            // Optionally fix IE Bug characters
            // NOTE(review): 0xFF00..0xFF07 looks like a typo for 0xFFF0..0xFFF7 (it would
            // then be contiguous with 0xFFF8..0xFFFD) -- confirm before changing.
            case 0xFF00: case 0xFF01: case 0xFF02: case 0xFF03: case 0xFF04: case 0xFF05: case 0xFF06: case 0xFF07:
            case 0xFFF8: case 0xFFF9: case 0xFFFA: case 0xFFFB: case 0xFFFC: case 0xFFFD:
                if ((flags & QUOTE_IEBUG) == 0) return null;
                prefix = "";
                break;

            default:
                if (c <= 0x7E) { // don't quote other ASCII
                    if ((flags & QUOTE_ASCII) == 0) return null;
                } else if (0xD800 <= c && c <= 0xDFFF) {// fix surrogates, since XML can't handle
                    prefix = "";
                } else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) {
                    prefix = "";
                } else if ((flags & QUOTE_NON_ASCII) == 0) {
                    return null;
                }
                break;
        }
        if ((flags & QUOTE_DECIMAL) == 0) {
            return prefix + "#x" + hex(c,1) + ";";
        } else {
            return prefix + "#" + Integer.toString(c) + ";";
        }
    }
}

View file

@ -0,0 +1,35 @@
package com.ibm.text.utility;
/** Constants shared by XMLParse and its clients.
 * A class may "implement" this interface simply to refer to the constants
 * without the XMLParseTypes. prefix.
 */
public interface XMLParseTypes {

    // ---- Kind values, as reported by XMLParse.getKind() and next() ----

    /** End of input. */
    byte DONE = 0;

    // start of a tag: <name   </name   <!...   <?name
    byte ELEMENT_TAG = 1;
    byte ELEMENT_TAG_SLASH = 2;
    byte ELEMENT_TAG_COMMENT = 3;
    byte ELEMENT_TAG_QUESTION = 4;

    // end of a tag: >   />   > after a comment   ?>
    byte END_ELEMENT = 5;
    byte END_ELEMENT_SLASH = 6;
    byte END_ELEMENT_COMMENT = 7;
    byte END_ELEMENT_QUESTION = 8;

    byte ATTRIBUTE_TAG = 9;
    byte ATTRIBUTE_VALUE = 10;

    byte TEXT = 11;

    // ---- Flag masks for XMLParse.quote(x, flags); combine with '|' ----

    byte QUOTE_NON_ASCII = 1;
    byte QUOTE_ASCII = 2;
    byte QUOTE_IEBUG = 4;
    byte QUOTE_TABCRLF = 8;
    byte QUOTE_DECIMAL = 16;

    /** Names of the kind values, indexed by kind. For debugging.
     */
    String[] kindNames = new String[] {
        "DONE",
        "ELEMENT_TAG",
        "ELEMENT_TAG_SLASH",
        "ELEMENT_TAG_COMMENT",
        "ELEMENT_TAG_QUESTION",
        "END_ELEMENT",
        "END_ELEMENT_SLASH",
        "END_ELEMENT_COMMENT",
        "END_ELEMENT_QUESTION",
        "ATTRIBUTE_TAG",
        "ATTRIBUTE_VALUE",
        "TEXT",
    };
}

View file

@ -0,0 +1,336 @@
package com.ibm.text.utility;
/** Simple Test program for XMLParse
*/
import java.io.*;
import java.util.*;
public class testParser implements XMLParseTypes {
public static final String BASE_DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\UNIDATA 3.0.1\\";
public static final boolean VERBOSE = false;
private static final String testFile = BASE_DIR + "UCD-Main.xml"; // "test.xml"; // BASE_DIR + "UCD-Main.xml";
/**
* Entry point. Runs test3() only; the calls to test1() and test2() are
* commented out and can be re-enabled by hand. Arguments are ignored.
*/
public static void main (String[] args) throws Exception {
//test1();
//test2();
test3();
}
/**
 * Round-trip check: parses testFile and echoes up to 100000 items to System.out,
 * re-quoted, so that the output can be compared with the input by eye.
 * With VERBOSE set, each item is instead listed as kind, raw value, quoted value.
 */
public static void test1() throws Exception {
    XMLParse parser = new XMLParse(testFile, new char[1000]);
    int remaining = 100000;
    while (remaining-- > 0) {
        byte kind = parser.next();
        if (kind == DONE) {
            break;
        }
        String value = parser.getValue();

        // TAB/CR/LF are left alone inside text, quoted everywhere else
        int quoteFlags = QUOTE_IEBUG | QUOTE_NON_ASCII;
        if (kind != TEXT) {
            quoteFlags |= QUOTE_TABCRLF;
        }
        String qValue = XMLParse.quote(value, quoteFlags);

        if (VERBOSE) {
            System.out.println(kindNames[kind] + ", \"" + value + "\", \"" + qValue + "\"");
            continue;
        }

        // rebuild the markup that produced this item
        String markup;
        switch (kind) {
            case ELEMENT_TAG:           markup = '<' + qValue; break;
            case ELEMENT_TAG_SLASH:     markup = "</" + qValue; break;
            case ELEMENT_TAG_COMMENT:   markup = "<" + qValue; break;
            case ELEMENT_TAG_QUESTION:  markup = "<?" + qValue; break;
            case END_ELEMENT:
            case END_ELEMENT_COMMENT:   markup = ">"; break;
            case END_ELEMENT_SLASH:     markup = "/>"; break;
            case END_ELEMENT_QUESTION:  markup = "?>"; break;
            case ATTRIBUTE_TAG:         markup = " " + qValue + "="; break;
            case ATTRIBUTE_VALUE:       markup = "\"" + qValue + "\""; break;
            case TEXT:                  markup = qValue; break;
            default: throw new Exception("Unknown KIND");
        }
        System.out.print(markup);
    }
}
static final int NORMAL_QUOTE = QUOTE_NON_ASCII | QUOTE_IEBUG | QUOTE_TABCRLF;
/**
 * Writes "UCD-Extract.html": an HTML rendering of the "e" elements found in
 * UCD-Main.xml, one table row per element with the columns Code, Char, GC,
 * Props and Name.
 *
 * <p>The attributes of each "e" element are collected into a map and turned into a
 * row when the element's start tag ends. Missing "n", "xs" and "gc" attributes are
 * shown as "&lt;computed&gt;", a no-break space and "Lo" respectively. A new table
 * is started whenever the code point leaves the current block of 32 code points,
 * and a progress message is printed to standard output whenever it leaves the
 * current block of 256.
 *
 * @throws Exception if the input cannot be parsed or the output cannot be written
 */
static void test2() throws Exception {
    PrintWriter log = Utility.openPrintWriter("UCD-Extract.html");
    try {
        log.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        log.println("<style><!--");
        log.println("th { background-color: #99FFFF; text-align: Left; font-style: italic; font-weight: bold }");
        log.println("table { page-break-after: always }");
        log.println("--></style>");
        log.println("<title>Extract from UCD</title>");
        log.println("</head><body>");
        // The header row is closed exactly once; the original emitted a stray second "</tr>".
        String tableHead = "<table border='1' width='100%' cellpadding='4'><tr>"
            + "<th width='20'>Code</th>"
            + "<th width='20'>Char</th>"
            + "<th width='20'>GC</th>"
            + "<th width='50%'>Props</th>"
            + "<th width='50%'>Name</th></tr>";
        log.println(tableHead);

        XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
        boolean recordingChar = false;  // true while inside the start tag of an "e" element
        int topByte = 0;                // code point that opened the current table
        int printByte = 0;              // code point at the last progress message
        Map data = new TreeMap();       // attribute name -> value for the current "e" element
        String lastTag = "";            // most recent attribute name

        while (true) {
            byte kind = xml.next();
            if (kind == DONE) break;
            String value = xml.getValue();
            switch (kind) {
            case ELEMENT_TAG:
                recordingChar = value.equals("e");
                break;
            case ATTRIBUTE_TAG:
                if (!recordingChar) break;
                lastTag = value;
                break;
            case ATTRIBUTE_VALUE:
                if (!recordingChar) break;
                data.put(lastTag, value);
                break;
            case END_ELEMENT:
            case END_ELEMENT_SLASH:
                if (!recordingChar) break;
                recordingChar = false;

                // get data, substituting defaults for absent attributes
                String ch = (String)data.get("c");
                ch = fixHack(ch);
                String name = (String)data.get("n");
                if (name == null) name = "<computed>";
                String props = (String)data.get("xs");
                if (props == null) props = "\u00A0";
                String gc = (String)data.get("gc");
                if (gc == null) gc = "Lo";

                // split tables every 32 code points
                int code = UTF32.char32At(ch, 0);
                if ((topByte & ~0x1F) != (code & ~0x1F)) {
                    log.println("</table><br>");
                    log.println(tableHead);
                    topByte = code;
                    if ((printByte & ~0xFF) != (code & ~0xFF)) {
                        System.out.println("Printing table for " + XMLParse.hex(topByte,2));
                        printByte = code;
                    }
                }

                // draw line
                log.println("<tr><td>" + XMLParse.hex(code,4) +
                    "</td><td>" + XMLParse.quote(ch,NORMAL_QUOTE) +
                    "</td><td>" + XMLParse.quote(gc,NORMAL_QUOTE) +
                    "</td><td>" + XMLParse.quote(props,NORMAL_QUOTE) +
                    "</td><td>" + XMLParse.quote(name,NORMAL_QUOTE) + "</td></tr>");

                // clear storage so nothing carries over to the next element
                data.clear();
                break;
            }
        }
        log.println("</table></body></html>");
    } finally {
        log.close();
    }
}
/**
 * Writes the code point ranges for the properties Other_Math, Other_Alphabetic and
 * Other_Composite, in that order, to "CaseFoldingDraft3.txt" under BASE_DIR.
 * The file is written through a 32K buffer using the "UTF8" encoding, and is closed
 * even if collecting a property fails.
 *
 * @throws Exception if the output file cannot be opened or collect fails
 */
static void test3() throws Exception {
    OutputStream fileOut = new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt");
    Writer utf8Out = new OutputStreamWriter(fileOut, "UTF8");
    PrintWriter log = new PrintWriter(new BufferedWriter(utf8Out, 32*1024));
    try {
        String[] properties = { "Other_Math", "Other_Alphabetic", "Other_Composite" };
        for (int i = 0; i < properties.length; ++i) {
            collect(log, properties[i]);
        }
    } finally {
        log.close();
    }
}
/**
 * Scans UCD-Main.xml and writes to {@code log} one line for every run of consecutive
 * code points whose "xs" attribute value contains {@code prop}.
 *
 * <p>Each line has the form
 * {@code start; end; prop; # category; count; name}, where the end field is a single
 * space and the name has no suffix for a one-character run, and the name is followed
 * by "..." for a longer run. Category and name are those of the first character of
 * the run. A progress message goes to standard error every 10000 tokens.
 *
 * <p>The "c", "n" and "gc" values used for a character are the ones seen before its
 * "xs" attribute, so those attributes are expected to precede "xs" within an element.
 *
 * @param log  destination for the range lines
 * @param prop property name to look for inside the "xs" attribute value
 * @throws Exception if the input cannot be opened or parsed
 */
static final void collect(PrintWriter log, String prop) throws Exception {
    XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
    String lastTag = "";    // most recent attribute name
    String lastChar = "";   // "c" value of the current element
    String lastName = "";   // "n" value of the current element
    String lastCat = "";    // "gc" value of the current element
    int startChar = -1;
    int endChar = -2;       // negative until the first matching character opens a run
    String startName = "";
    String startCat = "";
    for (int line = 0; ; ++line) {
        if ((line % 10000) == 0) System.err.println("Item " + line);
        byte kind = xml.next();
        if (kind == DONE) break;
        String value = xml.getValue();
        switch (kind) {
        case ELEMENT_TAG:
            // Start every element from the defaults test2 uses for absent attributes.
            // Without this, an element with no "n" or "gc" attribute would be reported
            // with the name or category of whichever earlier element last had one.
            lastName = "<computed>";
            lastCat = "Lo";
            break;
        case ATTRIBUTE_TAG:
            lastTag = value;
            break;
        case ATTRIBUTE_VALUE:
            if (lastTag.equals("c")) lastChar = value;
            else if (lastTag.equals("n")) lastName = value;
            else if (lastTag.equals("gc")) lastCat = value;
            else if (lastTag.equals("xs") && value.indexOf(prop) >= 0) {
                lastChar = fixHack(lastChar);
                int ch = UTF32.char32At(lastChar,0);
                if (ch == endChar + 1) {
                    // extends the open run by one code point
                    endChar = ch;
                } else {
                    // e.g. FDD0; FDEF; Noncharacter_Code_Point; # XX; 32;
                    if (endChar >= 0) {
                        printRange(log, prop, startChar, endChar, startCat, startName);
                    }
                    startChar = endChar = ch;
                    startName = lastName;
                    startCat = lastCat;
                }
            }
            break;
        }
    }
    // flush the run that was still open when the input ended
    if (endChar >= 0) {
        printRange(log, prop, startChar, endChar, startCat, startName);
    }
}

/** Writes the output line for the inclusive run startChar..endChar; see collect for the format. */
private static void printRange(PrintWriter log, String prop, int startChar, int endChar,
        String startCat, String startName) {
    boolean single = endChar == startChar;
    log.println(Utility.hex(startChar, 4) + "; "
        + (single ? " " : Utility.hex(endChar, 4))
        + "; " + prop
        + "; # " + startCat
        + "; " + (endChar-startChar+1)
        + "; " + startName
        + (single ? "" : "..."));
}
/**
 * Writes case-folding lines derived from the "e" elements of UCD-Main.xml to
 * "CaseFoldingDraft3.txt" under BASE_DIR (the same file name test3 writes).
 *
 * <p>For each element the folded form is the "fc" attribute, else "sl", else the
 * lowercase form ("lc", defaulting to the character itself). Characters whose folded
 * form equals the character produce no output. Otherwise a "C" line is written when
 * the folded form has length 1; if not, an "F" line is written, followed by an "S"
 * line with the lowercase form when that differs from the character. A progress
 * message goes to standard error every 10000 tokens.
 *
 * @throws Exception if the input cannot be parsed or the output cannot be written
 */
static void test4() throws Exception {
    PrintWriter log = new PrintWriter(new BufferedWriter(
        new OutputStreamWriter(
            new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"),
            "UTF8"),
        32*1024));
    try {
        XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]);
        boolean recordingChar = false;  // true while inside the start tag of an "e" element
        Map data = new TreeMap();       // attribute name -> value for the current "e" element
        String lastTag = "";            // most recent attribute name
        for (int line = 0; ; ++line) {
            if ((line % 10000) == 0) System.err.println("Item " + line);
            byte kind = xml.next();
            if (kind == DONE) break;
            String value = xml.getValue();
            switch (kind) {
            case ELEMENT_TAG:
                recordingChar = value.equals("e");
                break;
            case ATTRIBUTE_TAG:
                if (!recordingChar) break;
                lastTag = value;
                break;
            case ATTRIBUTE_VALUE:
                if (!recordingChar) break;
                data.put(lastTag, value);
                break;
            case END_ELEMENT:
            case END_ELEMENT_SLASH:
                if (!recordingChar) break;
                recordingChar = false;

                // get data, substituting defaults for absent attributes
                String ch = (String)data.get("c");
                ch = fixHack(ch);
                String name = (String)data.get("n");
                if (name == null) name = "<computed>";
                String lc = (String)data.get("lc");
                if (lc == null) lc = ch;
                String fc = (String)data.get("fc");
                if (fc == null) fc = (String)data.get("sl");
                if (fc == null) fc = lc;

                // Characters that fold to themselves are skipped. This is a plain
                // "if" rather than "continue" so that the map is still cleared below;
                // otherwise this element's attributes would leak into the next one.
                if (!fc.equals(ch)) {
                    // NOTE(review): length() counts UTF-16 units, so a folding to one
                    // supplementary code point takes the "F" branch - confirm intended.
                    if (fc.length() == 1) {
                        log.println(Utility.hex(ch, " ") + "; C; " + Utility.hex(fc, " ") + "; # " + name);
                    } else {
                        log.println(Utility.hex(ch, " ") + "; F; " + Utility.hex(fc, " ") + "; # " + name);
                        if (!lc.equals(ch)) {
                            log.println(Utility.hex(ch, " ") + "; S; " + Utility.hex(lc, " ") + "; # " + name);
                        }
                    }
                }

                // clear storage so nothing carries over to the next element
                data.clear();
                break;
            }
        }
    } finally {
        log.close();
    }
}
/**
 * Replaces every sequence of the form {@code #x<hex digits>;} in {@code s} with the
 * character whose code point is given by the hex digits, as produced by
 * UTF32.valueOf32. Only the text from '#' through ';' is replaced; a character in
 * front of the '#' (for example '&amp;') is kept as it is.
 *
 * <p>If the string ends inside a sequence (no closing ';'), the incomplete text is
 * copied to the result unchanged instead of being dropped.
 *
 * @param s the text to convert; must not be null
 * @return the converted text
 * @throws NumberFormatException if the text between "#x" and ';' is not a hex number
 */
static final String fixHack(String s) {
    StringBuffer result = new StringBuffer();
    char last = '\u0000';
    int position = -1;  // index of the first hex digit while inside a sequence, else -1
    for (int i = 0; i < s.length(); ++i) {
        char c = s.charAt(i);
        if (position >= 0) {
            // inside a sequence: swallow characters until the terminating ';'
            if (c == ';') {
                int x = Integer.parseInt(s.substring(position,i),16);
                result.append(UTF32.valueOf32(x));
                position = -1;
            }
        } else if (last == '#' && c == 'x') {
            result.setLength(result.length()-1); // remove '#'
            position = i+1;
        } else {
            result.append(c);
        }
        last = c;
    }
    if (position >= 0) {
        // Unterminated sequence: restore it verbatim, starting at the '#' that was
        // removed (two characters before the first hex digit).
        result.append(s.substring(position - 2));
    }
    return result.toString();
}
}