Forgot copyrights

X-SVN-Rev: 5643
This commit is contained in:
Mark Davis 2001-08-31 00:30:17 +00:00
parent 4c3e3b8dff
commit 7260c9a6a4
20 changed files with 1310 additions and 1050 deletions

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
@ -10,35 +23,35 @@ import com.ibm.text.utility.*;
public class BuildNames implements UCD_Types {
static final boolean DEBUG = true;
static UCD ucd;
/**
 * Command-line entry point: obtains the shared UCD instance via
 * {@code UCD.make()} and then runs {@link #collectWords()}.
 *
 * @param args command-line arguments (not read by this method)
 * @throws IOException propagated from {@code collectWords()}
 */
public static void main(String[] args) throws IOException {
    BuildNames.ucd = UCD.make();
    BuildNames.collectWords();
}
// Words collected by stash(); kept in a TreeSet ordered by LengthFirstComparator.
static Set words = new TreeSet(new LengthFirstComparator());
// Transformed character names added by collectWords(); same comparator as 'words'.
static Set lines = new TreeSet(new LengthFirstComparator());
// Occurrence count per character, indexed by char value (incremented in stash()).
// Only indices 0..127 exist, so a char >= 128 would raise ArrayIndexOutOfBoundsException.
static int[] letters = new int[128];
/**
 * Records a word: adds it to {@code words} and bumps the per-character
 * counter in {@code letters} for every character it contains.
 *
 * @param word the word to record; each of its characters is used directly
 *             as an index into the 128-entry {@code letters} table
 */
static void stash(String word) {
    words.add(word);
    int pos = 0;
    while (pos < word.length()) {
        char ch = word.charAt(pos);
        letters[ch] += 1;
        pos++;
    }
}
static String transform(String line) {
StringBuffer result = new StringBuffer();
boolean changed = false;
for (int i = 0; i < line.length(); ++i) {
char c = line.charAt(i);
if (c == '-' || c == '<' || c == '>') {
if (result.length() > 0 && result.charAt(result.length()-1) != ' ') result.append(' ');
result.append(c);
@ -46,7 +59,7 @@ public class BuildNames implements UCD_Types {
changed = true;
continue;
}
if ('a' <= c && c <= 'z') {
result.append((char)(c - 'a' + 'A'));
changed = true;
@ -56,15 +69,15 @@ public class BuildNames implements UCD_Types {
result.append('*').append((char)(c - '0' + 'A'));
changed = true;
continue;
}
}
result.append(c);
}
if (!changed) return line;
return result.toString().trim();
}
static void collectWords() throws IOException {
System.out.println("Gathering data");
//Counter counter = new Counter();
String[] parts = new String[100];
@ -74,23 +87,23 @@ public class BuildNames implements UCD_Types {
for (int i = 0; i < 0x10FFFF; ++i) {
if (ucd.hasComputableName(i)) continue;
String name = transform(ucd.getName(i));
sum += name.length();
used++;
// replace numbers & letters
int len = Utility.split(name, ' ', parts);
for (int j = 0; j < len; ++j) {
stash(parts[j]);
}
lines.add(name);
}
System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
System.out.println("Strings: " + sum + ", " + (lastLink*4));
System.out.println();
System.out.println("Compacting Words");
System.out.println();
@ -104,7 +117,7 @@ public class BuildNames implements UCD_Types {
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
+ (goesRound ? ": NO RT: '" + round + "'" : ""));
}
System.out.println();
System.out.println("Compacting Lines");
System.out.println();
@ -122,18 +135,18 @@ public class BuildNames implements UCD_Types {
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
+ (!goesRound ? ": NO RT: '" + round + "'" : ""));
}
/*System.out.println("Printing Compact Forms");
for (int i = 0; i < CompactName.lastToken; ++i) {
String s = CompactName.stringFromToken(i);
System.out.println(i + ": '" + s + "'");
}*/
System.out.println("Strings: " + sum
+ ", " + (CompactName.spacedMinimum*4)
+ ", " + (CompactName.lastToken*4)
);
}
/*
Set stuff = new TreeSet();
@ -142,7 +155,7 @@ public class BuildNames implements UCD_Types {
stuff.add(new Integer((letters[i] << 8) + i));
}
}
it = stuff.iterator();
while (it.hasNext()) {
int in = ((Integer) it.next()).intValue();
@ -153,13 +166,13 @@ public class BuildNames implements UCD_Types {
System.out.println("\tNo Round Trip: '" + rname + "'");
}
*/
static Map stringToInt = new HashMap();
static Map intToString = new HashMap();
static final int[] remap = new int['Z'+1];
static final int maxToken;
static {
int counter = 1;
remap[' '] = counter++;
@ -174,7 +187,7 @@ public class BuildNames implements UCD_Types {
}
maxToken = counter;
}
static final String[] unmap = new String[maxToken];
static {
unmap[0] = "";
@ -183,16 +196,16 @@ public class BuildNames implements UCD_Types {
if (x != 0) unmap[x] = String.valueOf((char)i);
}
}
// Table of packed entries; putString() stores (lead << 16) + (trail & 0xFFFF) per slot.
static int[] links = new int[40000];
// NOTE(review): no use of linkStart is visible in this excerpt -- confirm before relying on it.
static final int linkStart = 0;
// Index of the next free slot in 'links'; post-incremented by putString().
static int lastLink = 0;
// Threshold compared against the low 15 bits of a value in isLiteral().
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
/**
 * Reports whether the low 15 bits of {@code i} lie strictly above
 * {@code LITERAL_BOUND}.
 *
 * @param i the value to test; bits above bit 14 are ignored
 * @return {@code true} if {@code (i & 0x7FFF)} exceeds {@code LITERAL_BOUND}
 */
static boolean isLiteral(int i) {
    int low15 = i & 0x7FFF;
    return LITERAL_BOUND < low15;
}
static String lookup(int i) {
String result;
boolean trailingSpace = false;
@ -216,7 +229,7 @@ public class BuildNames implements UCD_Types {
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
return result;
}
static int getInt(String s) {
if (s.length() < 3) {
if (s.length() == 0) return 0;
@ -228,14 +241,14 @@ public class BuildNames implements UCD_Types {
if (in == null) return -1;
return ((Integer)in).intValue();
}
static int putString(String s, int lead, int trail) {
Object in = stringToInt.get(s);
if (in != null) throw new IllegalArgumentException();
int value = (lead << 16) + (trail & 0xFFFF);
int result = lastLink;
links[lastLink++] = value;
if (DEBUG) {
System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail);
String roundTrip = lookup(result);
@ -246,7 +259,7 @@ public class BuildNames implements UCD_Types {
stringToInt.put(s, new Integer(result));
return result;
}
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
static int addString(String s) {
int result = getInt(s);
@ -259,9 +272,9 @@ public class BuildNames implements UCD_Types {
int lastSpace = -1;
int spaceBits;
int endOfFirst;
// invariant. We break after a space if there is one.
for (int i = 1; i < limit; ++i) {
char c = s.charAt(i-1);
spaceBits = 0;
@ -271,7 +284,7 @@ public class BuildNames implements UCD_Types {
endOfFirst--;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(i);
if (firstPart.equals("<START OF ")) {
@ -292,7 +305,7 @@ public class BuildNames implements UCD_Types {
if (i > bestSpaceLen && c == ' ') {
bestSpaceLen = i;
bestSpace_i = i + 1;
}
}
}
int end_i = s.length() - i;
if (!isLiteral(trail)) {
@ -310,9 +323,9 @@ public class BuildNames implements UCD_Types {
bestLen = bestSpaceLen;
best_i = bestSpace_i;
}
spaceBits = 0;
if (bestLen > 0) { // if one matches, recurse -- and return pair
endOfFirst = best_i;
if (lastSpace > 0) {
@ -335,8 +348,8 @@ public class BuildNames implements UCD_Types {
}
// otherwise, we failed to find anything. Then break before the last word, if there is one
// otherwise break in the middle (but at even value)
if (lastSpace >= 0) {
best_i = lastSpace;
endOfFirst = lastSpace - 1;
@ -350,7 +363,7 @@ public class BuildNames implements UCD_Types {
+ "' # '" + lastPart + "' FALLBACK");
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
}
/*
static int addCompression(String s) {
Object in = stringToInt.get(s);
@ -363,7 +376,7 @@ public class BuildNames implements UCD_Types {
if (c == ' ' || c == '-') {
Object pos1 = stringToInt.get(s.substring(0,i+1));
//Object pos23 = stringToInt.get(s..substring(i));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
@ -381,11 +394,11 @@ public class BuildNames implements UCD_Types {
}
}
}
}
}
}
static void gatherData() throws IOException {
System.out.println("Gathering data");
Counter counter = new Counter();
@ -415,29 +428,29 @@ public class BuildNames implements UCD_Types {
}
}
}
System.out.println("Sorting data");
Map m = counter.extract();
System.out.println("Printing data");
PrintWriter log = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
32*1024));
log.println("total: " + total);
Iterator it = m.keySet().iterator();
String mondo = "";
int i = 0;
int strTotal = 0;
int index = 0;
Map fullToCompressed = new HashMap();
String mondoIndex = "";
main:
@ -448,20 +461,20 @@ public class BuildNames implements UCD_Types {
String value = (String)m.get(key);
log.println(i++ + ": " + key + ": \"" + value + "\"");
strTotal += value.length();
// first 128 are the highest frequency, inc. space
if (index < 128 - SINGLES) {
mondo += value;
fullToCompressed.put(value, new String((char)(index + reserved)));
continue;
}
int pos = mondo.indexOf(value);
if (pos >= 0) {
// try splitting!
int bestBreak = -1;
boolean pickFirst = false;
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
@ -493,22 +506,22 @@ public class BuildNames implements UCD_Types {
mondo += value;
}
}
// high bit on, means 2 bytes, look in array
}
log.println("strTotal: " + strTotal);
log.println("mondo: " + mondo.length());
int k = 80;
for (; k < mondo.length(); k += 80) {
log.println(mondo.substring(k-80, k));
}
log.println(mondo.substring(k-80)); // last line
log.close();
}
static int indexOf(StringBuffer target, String source) {
int targetLen = target.length() - source.length();
main:
@ -520,10 +533,10 @@ public class BuildNames implements UCD_Types {
}
return -1;
}
static final int SINGLES = 26 + 10 + 2;
*/
/*
static String decode(int x) {
if (x < SINGLES) {
@ -533,6 +546,6 @@ public class BuildNames implements UCD_Types {
return " ";
}
if (x < binaryLimit) {
x =
x =
*/
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompactName.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
@ -6,28 +19,28 @@ import java.io.*;
import java.text.*;
public class CompactName {
static final boolean DEBUG = false;
/**
 * Small manual smoke test: round-trips "ABZ" through
 * {@code tokenFromString}/{@code stringFromToken}, adds one word, then
 * prints the string for every token index below {@code lastToken}.
 *
 * @param args command-line arguments (not read by this method)
 * @throws IOException declared for callers; nothing visible here throws it
 */
public static void main(String[] args) throws IOException {
    System.out.println(stringFromToken(tokenFromString("ABZ")));

    addWord("ABSOLUTEISM");

    int token = 0;
    while (token < lastToken) {
        System.out.println(stringFromToken(token));
        token++;
    }
}
static final char[] compactMap = new char[128];
static final char[] compactUnmap = new char[128];
static {
char counter = 0;
compactMap[0] = counter++;
@ -38,14 +51,14 @@ public class CompactName {
compactMap['>'] = counter++;
compactMap['<'] = counter++;
compactMap['*'] = counter++;
compactUnmap[0] = 0;
for (char i = 0; i < compactUnmap.length; ++i) {
int x = compactMap[i];
if (x != 0) compactUnmap[x] = i;
}
}
/*
static String expand(String s) {
StringBuffer result = new StringBuffer();
@ -58,7 +71,7 @@ public class CompactName {
}
return result.toString();
}
static String compact(String s) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
@ -72,27 +85,27 @@ public class CompactName {
return result.toString();
}
*/
// Maps a string to the Integer index of its entry in tokenList (filled by addTokenForString()).
static Map string_token = new HashMap();
// NOTE(review): presumably the reverse mapping; no use is visible in this excerpt -- confirm.
static Map token_string = new HashMap();
// Packed entries; addTokenForString() stores (lead << 16) + (trail & 0xFFFF) per slot.
static int[] tokenList = new int[40000];
// NOTE(review): no use of tokenStart is visible in this excerpt -- confirm before relying on it.
static final int tokenStart = 0;
// Index of the next free slot in tokenList; post-incremented by addTokenForString().
static int lastToken = 0;
// Value of lastToken captured by startLines(); stays Integer.MAX_VALUE until that is called.
static int spacedMinimum = Integer.MAX_VALUE;
/**
 * Reports whether bit 15 (mask {@code 0x8000}) of {@code i} is set.
 *
 * @param i the value to test; all other bits are ignored
 * @return {@code true} if the {@code 0x8000} bit is set in {@code i}
 */
static boolean isLiteral(int i) {
    final int literalFlag = 0x8000;
    return (i & literalFlag) == literalFlag;
}
static int addTokenForString(String s, int lead, int trail) {
Object in = string_token.get(s);
if (in != null) throw new IllegalArgumentException();
int value = (lead << 16) + (trail & 0xFFFF);
int result = lastToken;
tokenList[lastToken++] = value;
if (DEBUG) {
System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
String roundTrip = stringFromToken(result);
@ -103,7 +116,7 @@ public class CompactName {
string_token.put(s, new Integer(result));
return result;
}
static String stringFromToken(int i) {
String result;
if ((i & 0x8000) != 0) {
@ -125,7 +138,7 @@ public class CompactName {
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
return result;
}
static int tokenFromString(String s) {
if (s.length() <= 3) {
int first = compactMap[s.charAt(0)];
@ -137,17 +150,17 @@ public class CompactName {
if (in == null) return -1;
return ((Integer)in).intValue();
}
static int addWord(String s) {
int result = tokenFromString(s);
if (result != -1) return result;
int bestLen = 0;
int best_i = 0;
int limit = s.length() - 1;
for (int i = limit; i >= 1; --i) {
String firstPart = s.substring(0, i);
@ -155,7 +168,7 @@ public class CompactName {
int lead = tokenFromString(firstPart);
int trail = tokenFromString(lastPart);
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
return addTokenForString(s, lead, trail);
@ -187,34 +200,34 @@ public class CompactName {
return addTokenForString(s, addWord(firstPart), trail);
}
}
// break at multiple of 3
best_i = ((s.length() + 1) / 6) * 3;
String firstPart = s.substring(0, best_i);
String lastPart = s.substring(best_i);
if (DEBUG) show(s, firstPart, lastPart, "Fallback");
return addTokenForString(s, addWord(firstPart), addWord(lastPart));
}
/**
 * Prints one diagnostic line to standard output describing how a string
 * was split, in the form {@code s => 'firstPart' # 'lastPart' comment}.
 *
 * @param s         the full string that was split
 * @param firstPart the leading piece
 * @param lastPart  the trailing piece
 * @param comment   free-form tag appended after the two pieces
 */
static void show(String s, String firstPart, String lastPart, String comment) {
    StringBuffer message = new StringBuffer();
    message.append(s);
    message.append(" => '").append(firstPart);
    message.append("' # '").append(lastPart);
    message.append("' ").append(comment);
    System.out.println(message.toString());
}
/**
 * Snapshots the current value of {@code lastToken} into
 * {@code spacedMinimum}.
 */
static void startLines() {
    CompactName.spacedMinimum = CompactName.lastToken;
}
static int addLine(String s) {
int result = tokenFromString(s);
if (result != -1) return result;
int bestLen = 0;
int best_i = 0;
int limit = s.length() - 2;
for (int i = limit; i >= 1; --i) {
char c = s.charAt(i);
if (c != ' ') continue;
@ -224,7 +237,7 @@ public class CompactName {
int lead = tokenFromString(firstPart);
int trail = tokenFromString(lastPart);
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
return addTokenForString(s, lead, trail);
@ -253,7 +266,7 @@ public class CompactName {
return addTokenForString(s, addLine(firstPart), trail);
}
}
System.out.println("SHOULD HAVE MATCHED!!");
throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
@ -7,20 +20,20 @@ import java.text.NumberFormat;
import java.io.*;
/** Simple program to merge UCD files into XML. Not yet documented!!
/** Simple program to merge UCD files into XML. Not yet documented!!
* @author Mark Davis
*/
public final class ConvertUCD implements UCD_Types {
public static final boolean SHOW = true;
public static final boolean DEBUG = false;
public static int major;
public static int minor;
public static int update;
static String version;
// varies by version
/*
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
@ -29,10 +42,10 @@ public final class ConvertUCD implements UCD_Types {
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
*/
//public static final String blocksnamePlain = "Blocks.txt";
//public static final String blocksname31 = "Blocks-4d2.beta";
/** First item is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
@ -47,10 +60,10 @@ public final class ConvertUCD implements UCD_Types {
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
{"PropList", "binary"},
{"PropList", "binary"},
//{"ExtraProperties", "xp"},
{"EastAsianWidth", "ea", "OMIT"},
{"LineBreak", "lb", "OMIT"},
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
@ -76,10 +89,10 @@ public final class ConvertUCD implements UCD_Types {
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"PropList-3.1.0d5.beta", "binary"},
{"PropList-3.1.0d5.beta", "binary"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
@ -98,13 +111,13 @@ public final class ConvertUCD implements UCD_Types {
/*
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
{"ExtraProperties", "xp"},
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
{"LineBreak-6d6.beta", "lb", "OMIT"},
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
{"CompositionExclusions-3d6.beta", "ce"},
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
{"ArabicShaping", "OMIT", "jt", "jg"},
{"BidiMirroring", "*bg"},
{"Scripts-1d4", "sn"},
@ -114,9 +127,9 @@ public final class ConvertUCD implements UCD_Types {
/*
{"Jamo", "jn"},
//
//"NamesList-3.1.0d1.beta"
static String[][] labelList30 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
@ -133,28 +146,28 @@ public final class ConvertUCD implements UCD_Types {
{"BidiMirroring", "*bg"},
/*
{"Jamo", "jn"},
{"PropList.alpha", "RANGE", "OMIT"},
{"PropList.alpha", "RANGE", "OMIT"},
//
};
static String[][] labelList11 = {
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList20 = {
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
static String[][] labelList21 = {
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
};
*/
// handles
public static final String blocksname = "Blocks";
//public static final String[][] labelList;
public static final boolean NEWPROPS = true;
/*
static {
switch (major*10 + minor) {
@ -180,23 +193,23 @@ public final class ConvertUCD implements UCD_Types {
break;
}
}
*/
static final String dataFilePrefix = "UCD_Data";
// MAIN!!
public static void main (String[] args) throws Exception {
System.out.println("ConvertUCD");
log = new PrintWriter(new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "UCD-log.txt"),
"UTF8"),
32*1024));
log.write("\uFEFF"); // BOM
try {
for (int i = 0; i < args.length; ++i) {
version = args[i];
@ -206,14 +219,14 @@ public final class ConvertUCD implements UCD_Types {
major = Integer.parseInt(parts[0]);
minor = Integer.parseInt(parts[1]);
update = Integer.parseInt(parts[2]);
toJava();
}
} finally {
log.close();
}
}
/*
static void toXML() throws Exception {
// Blocks is special
@ -228,7 +241,7 @@ public final class ConvertUCD implements UCD_Types {
writeXML();
}
*/
static void toJava() throws Exception {
// Blocks is special
// Unihan is special
@ -239,7 +252,7 @@ public final class ConvertUCD implements UCD_Types {
} else {
readSemi(labelList[0]); // TESTING ONLY
}
Iterator it = charData.keySet().iterator();
while (it.hasNext()) {
Object key = it.next();
@ -253,40 +266,40 @@ public final class ConvertUCD implements UCD_Types {
writeJavaData();
}
static PrintWriter log;
//static String directory = BASE_DIR;
//static Map appendDuplicates = new HashMap();
/** First item in labels is file name, rest are field names (skipping character).
* "OMIT" is special -- means don't record
*/
static HashMap isHex = new HashMap();
static HashMap defaults = new HashMap();
static {
for (int j = 0; j < labelList.length; ++j) {
String[] labels = labelList[j];
for (int i = 1; i < labels.length; ++i) {
boolean hex = false;
String def = null;
//char appendChar = '\u0000';
// pull off "*": hex interpretation
if (labels[i].charAt(0) == '*') { // HEX value
hex = true;
labels[i] = labels[i].substring(1);
}
/*
// pull off "$": append duplicates
if (labels[i].charAt(0) == '$') { // HEX value
appendChar = labels[i].charAt(1);
labels[i] = labels[i].substring(2);
}
// pull off default values
int pos = labels[i].indexOf('-');
if (pos >= 0) {
@ -296,16 +309,16 @@ public final class ConvertUCD implements UCD_Types {
*/
// store results
// we do this after all processing, so that the label is clean!!
if (hex) isHex.put(labels[i], "");
//if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
defaults.put(labels[i], def);
}
}
}
static List blockData = new LinkedList();
static void readBlocks() throws Exception {
System.out.println("Reading 'Blocks'");
BufferedReader input = Utility.openUnicodeFile(blocksname, version);
@ -316,7 +329,7 @@ public final class ConvertUCD implements UCD_Types {
line = input.readLine();
if (line == null) break;
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
//String original = line;
String comment = "";
int commentPos = line.indexOf('#');
@ -326,12 +339,12 @@ public final class ConvertUCD implements UCD_Types {
}
line = line.trim();
if (line.length() == 0) continue;
int count = Utility.split(line,';',parts);
if (count != 3) throw new ChainException("Bad count in Blocks", null);
blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()});
}
} catch (Exception e) {
System.out.println("Exception at: " + line);
throw e;
@ -339,9 +352,9 @@ public final class ConvertUCD implements UCD_Types {
input.close();
}
}
static Set properties = new TreeSet();
static void readSemi(String[] labels) throws Exception {
System.out.println();
System.out.println("Reading '" + labels[0] + "'");
@ -361,14 +374,14 @@ public final class ConvertUCD implements UCD_Types {
boolean showedSemi = false;
boolean showedShort = false;
String line = "";
try {
String[] parts = new String[20];
for (int lineNumber = 1; ; ++lineNumber) {
line = input.readLine();
if (line == null) break;
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
String original = line;
String comment = "";
int commentPos = line.indexOf('#');
@ -378,15 +391,15 @@ public final class ConvertUCD implements UCD_Types {
}
line = line.trim();
if (line.length() == 0) continue;
int count = Utility.split(line,';',parts);
if (parts[0].equals("2801")) {
System.out.println("debug?");
}
// fix malformed or simple lists.
if (count != labels.length) {
if (count == labels.length + 1 && parts[count-1].equals("")) {
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
@ -401,11 +414,11 @@ public final class ConvertUCD implements UCD_Types {
parts[i] = "";
}
} else {
throw new ChainException("wrong count: {0}",
throw new ChainException("wrong count: {0}",
new Object[] {new Integer(line), new Integer(count)});
}
}
// store char
// first field is always character OR range. May be UTF-32
int cpTop;
@ -420,9 +433,9 @@ public final class ConvertUCD implements UCD_Types {
cpTop = cpStart;
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
}
// properties first
if (labels[1].equals("PROP")) {
String prop = parts[2].trim();
@ -436,7 +449,7 @@ public final class ConvertUCD implements UCD_Types {
properties.add(prop);
if (Utility.find(prop, UCD_Names.DeletedProperties) == -1) { // only undeleted
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
if (end == 0) end = cpStart;
if (end == 0) end = cpStart;
for (int j = cpStart; j <= end; ++j) {
if (j != UCD.mapToRepresentative(j, false)) continue;
@ -447,7 +460,7 @@ public final class ConvertUCD implements UCD_Types {
} else { // not range!
String val = "";
String lastVal;
for (int i = 1; i < labels.length; ++i) {
String key = labels[i];
lastVal = val;
@ -462,7 +475,7 @@ public final class ConvertUCD implements UCD_Types {
for (int cps = cpStart; cps <= cpTop; ++cps) {
if (UCD.mapToRepresentative(cps, false) != cps) continue; // skip condensed ranges
if (key.equals("binary")) {
appendCharProperties(cps, val);
} else if (key.equals("fc")) {
@ -502,7 +515,7 @@ public final class ConvertUCD implements UCD_Types {
//printValues("JOINING_TYPE", jtSet);
//printValues("JOINING_GROUP", jgSet);
}
static void printValues(String title, Set s) {
Iterator it = s.iterator();
System.out.println("public static String[] " + title + " = {");
@ -521,9 +534,9 @@ public final class ConvertUCD implements UCD_Types {
System.out.println(" LIMIT_" + title + " = " + count);
System.out.println(";");
}
static Map charData = new TreeMap();
static void writeXML() throws IOException {
System.out.println("Writing 'UCD-Main.xml'");
BufferedWriter output = new BufferedWriter(
@ -531,29 +544,29 @@ public final class ConvertUCD implements UCD_Types {
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
"UTF8"),
32*1024);
try {
// write header
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
output.write("<UnicodeCharacterDatabase>\r\n");
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
// write blocks
Iterator it = blockData.iterator();
while (it.hasNext()) {
String[] block = (String[]) it.next();
output.write(" <block start='" + Utility.quoteXML(block[0])
output.write(" <block start='" + Utility.quoteXML(block[0])
+ "' end='" + Utility.quoteXML(block[1])
+ "' name='" + Utility.quoteXML(block[2])
+ "'/>\r\n" );
}
// write char data
it = charData.keySet().iterator();
while (it.hasNext()) {
Integer cc = (Integer) it.next();
@ -575,9 +588,9 @@ public final class ConvertUCD implements UCD_Types {
*/
output.write("/>\r\n");
}
// write footer
output.write("</UnicodeCharacterDatabase>\r\n");
} finally {
output.close();
@ -592,7 +605,7 @@ public final class ConvertUCD implements UCD_Types {
new BufferedOutputStream(
new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
128*1024));
// write header
dataOut.writeByte(BINARY_FORMAT);
dataOut.writeByte(major);
@ -603,7 +616,7 @@ public final class ConvertUCD implements UCD_Types {
dataOut.writeInt(charData.size());
System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
int count = 0;
// write records
try {
// write char data
@ -612,7 +625,7 @@ public final class ConvertUCD implements UCD_Types {
Object cc = (Object) it.next();
//codePoint = UTF32.char32At(cc,0);
if (DEBUG) System.out.println(Utility.hex(cc));
UData uData = (UData) charData.get(cc);
if (false && uData.name == null) {
System.out.println("Warning: NULL name\r\n" + uData);
@ -632,13 +645,13 @@ public final class ConvertUCD implements UCD_Types {
dataOut.close();
}
}
static String[] xsSplit = new String[40];
// Cache a little bit for speed
static int getEntryCodePoint = -1;
static UData getEntryUData = null;
static UData getEntryIfExists(int cp) {
if (cp == getEntryCodePoint) return getEntryUData;
Integer cc = new Integer(cp);
@ -648,7 +661,7 @@ public final class ConvertUCD implements UCD_Types {
getEntryUData = charEntry;
return charEntry;
}
/* Get entry in table for cc
*/
static UData getEntry(int cp) {
@ -671,7 +684,7 @@ public final class ConvertUCD implements UCD_Types {
UData charEntry = getEntry(cp);
charEntry.binaryProperties |= (1 << binProp);
}
static void appendCharProperties(int cp, String key) {
int ind;
//if (true || NEWPROPS) {
@ -683,17 +696,17 @@ public final class ConvertUCD implements UCD_Types {
//charEntry.binaryProperties |= (1 << ind);
setBinaryProperty(cp, ind);
}
static Set jtSet = new TreeSet();
static Set jgSet = new TreeSet();
/** Adds the character data. Signals duplicates with an exception
*/
static void addCharData(int cp, String key, String value) {
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
UData charEntry = getEntry(cp);
//if (cp < 10) System.out.println(" " + charEntry);
if (key.equals("bm")) {
if (value.equals("Y")) charEntry.binaryProperties |= 1;
} else if (key.equals("ce")) {
@ -723,7 +736,7 @@ public final class ConvertUCD implements UCD_Types {
}
}
setField(charEntry, key, Utility.fromHex(value));
// fix the numeric fields to be more sensible
} else if (key.equals("dd")) {
if (charEntry.numericType < UCD_Types.DECIMAL) {
@ -749,7 +762,7 @@ public final class ConvertUCD implements UCD_Types {
setField(charEntry, key, value);
}
}
static public void setField(UData uData, String fieldName, String fieldValue) {
try {
if (fieldName.equals("n")) {
@ -764,17 +777,17 @@ public final class ConvertUCD implements UCD_Types {
uData.simpleLowercase = fieldValue;
} else if (fieldName.equals("tc")) {
uData.simpleTitlecase = fieldValue;
} else if (fieldName.equals("su")) {
uData.fullUppercase = fieldValue;
} else if (fieldName.equals("sl")) {
uData.fullLowercase = fieldValue;
} else if (fieldName.equals("st")) {
uData.fullTitlecase = fieldValue;
} else if (fieldName.equals("sc")) {
uData.specialCasing = fieldValue;
} else if (fieldName.equals("xp")) {
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP);
//UCD_Names.BP_OLD
@ -796,20 +809,20 @@ public final class ConvertUCD implements UCD_Types {
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.DT);
} else if (fieldName.equals("nt")) {
uData.numericType = Utility.lookup(fieldValue, UCD_Names.NT);
} else if (fieldName.equals("ea")) {
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EA);
} else if (fieldName.equals("lb")) {
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LB);
} else if (fieldName.equals("sn")) {
uData.script = Utility.lookup(fieldValue, UCD_Names.SCRIPT);
} else if (fieldName.equals("jt")) {
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE);
} else if (fieldName.equals("jg")) {
uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.OLD_JOINING_GROUP);
} else if (fieldName.equals("nv")) {
if (major < 2) {
if (fieldValue.equals("-")) return;
@ -827,5 +840,5 @@ public final class ConvertUCD implements UCD_Types {
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
}
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import java.util.*;
@ -6,52 +19,52 @@ import com.ibm.text.utility.*;
final class DerivedPropertyLister extends PropertyLister {
// NOTE(review): no use of BRIDGE is visible in this excerpt -- confirm its purpose with callers.
static final boolean BRIDGE = false;
// NOTE(review): "enum" has been a reserved keyword since Java 5, so this declaration
// only compiles at source level 1.4 or earlier. Renaming it needs a check for other
// users of the field in this package first.
static int enum = 0;
static final int
PropMath = 0,
PropAlphabetic = 1,
PropLowercase = 2,
PropUppercase = 3,
ID_Start = 4,
ID_Continue_NO_Cf = 5,
Mod_ID_Start = 6,
Mod_ID_Continue_NO_Cf = 7,
Missing_Uppercase = 8,
Missing_Lowercase = 9,
Missing_Mixedcase = 10,
FC_NFKC_Closure = 11,
FullCompExclusion = 12,
FullCompInclusion = 13,
QuickNFD = 14,
QuickNFC = 15,
QuickNFKD = 16,
QuickNFKC = 17,
ExpandsOnNFD = 18,
ExpandsOnNFC = 19,
ExpandsOnNFKD = 20,
ExpandsOnNFKC = 21,
GenNFD = 22,
GenNFC = 23,
GenNFKD = 24,
GenNFKC = 25,
LIMIT = 26;
;
private int propMask;
private Normalizer[] nf = new Normalizer[4];
private Normalizer nfd, nfc, nfkd, nfkc;
int width;
public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
this.propMask = propMask;
this.output = output;
@ -60,7 +73,7 @@ final class DerivedPropertyLister extends PropertyLister {
nfc = nf[1] = new Normalizer(Normalizer.NFC);
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
width = super.minPropertyWidth();
switch (propMask) {
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
@ -75,7 +88,7 @@ final class DerivedPropertyLister extends PropertyLister {
break;
}
}
public String headerString() {
String result = "# Derived Property: ";
switch (propMask) {
@ -88,31 +101,31 @@ final class DerivedPropertyLister extends PropertyLister {
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
result += NAME[propMask-GenNFD] + "\r\n# Generated according to UAX #15."
+ "\r\n# Normalized forms, where different from the characters themselves."
+ ((propMask == 5 || propMask == 3)
+ ((propMask == 5 || propMask == 3)
? ""
: "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly.")
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
break;
case ID_Start: result +=
case ID_Start: result +=
"ID_Start"
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
break;
case ID_Continue_NO_Cf: result +=
case ID_Continue_NO_Cf: result +=
"ID_Continue"
+ "\r\n# Characters that can continue an identifier."
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
+ "\r\n# NOTE: Cf characters should be filtered out.";
break;
case Mod_ID_Start: result +=
case Mod_ID_Start: result +=
"XID_Start"
+ "\r\n# ID_Start modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
break;
case Mod_ID_Continue_NO_Cf: result +=
case Mod_ID_Continue_NO_Cf: result +=
"XID_Continue"
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
@ -124,7 +137,7 @@ final class DerivedPropertyLister extends PropertyLister {
result += "Math"
+ "\r\n# Generated from: Sm + Other_Math";
break;
case PropAlphabetic:
case PropAlphabetic:
result += "Alphabetic"
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
break;
@ -201,17 +214,17 @@ final class DerivedPropertyLister extends PropertyLister {
default: return "Unimplemented!!";
}
}
//public String optionalComment(int cp) {
// return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]";
//}
/**
 * Returns the width kept in the {@code width} field. The constructor seeds
 * that field from {@code super.minPropertyWidth()}.
 *
 * @return the current value of {@code width}
 */
public int minPropertyWidth() {
    return this.width;
}
static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};
/*
public String optionalComment(int cp) {
@ -229,8 +242,8 @@ final class DerivedPropertyLister extends PropertyLister {
}
}
*/
public byte status(int cp) {
if (!ucdData.isAssigned(cp)) return EXCLUDE;
//if (cp == 0xFFFF) {
@ -240,13 +253,13 @@ final class DerivedPropertyLister extends PropertyLister {
//if (cp == 0x0385) {
// System.out.println(Utility.hex(firstRealCp));
//}
String cps;
byte xCat;
switch (propMask) {
default: return EXCLUDE;
case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
cps = UTF32.valueOf32(cp);
@ -307,17 +320,17 @@ final class DerivedPropertyLister extends PropertyLister {
return EXCLUDE;
case FullCompExclusion:
/*
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
including all characters whose canonical decomposition consists of a single character.
(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
file by including all characters whose canonical decomposition consists of a sequence
of characters, the first of which has a non-zero combining class.
*/
*/
{
if (!ucdData.isRepresented(cp)) return EXCLUDE;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return EXCLUDE;
if (isCompEx(cp)) return INCLUDE;
return EXCLUDE;
}
@ -326,13 +339,13 @@ of characters, the first of which has a non-zero combining class.
if (!ucdData.isRepresented(cp)) return EXCLUDE;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return EXCLUDE;
if (isCompEx(cp)) return EXCLUDE;
return INCLUDE;
}
case FC_NFKC_Closure:
if (!ucdData.isRepresented(cp)) return EXCLUDE;
/*
b = Normalize(Fold(a));
c = Normalize(Fold(b));
@ -353,7 +366,7 @@ of characters, the first of which has a non-zero combining class.
}
return BREAK;
}
case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
lastValue = currentValue;
Normalizer nfx = nf[propMask - QuickNFD];
@ -364,8 +377,8 @@ of characters, the first of which has a non-zero combining class.
if (currentValue != lastValue) return BREAK;
return INCLUDE;
}
// handle script stuff
/*
if (firstRealCp == -1) return INCLUDE;
@ -373,12 +386,12 @@ of characters, the first of which has a non-zero combining class.
if (cat == cat2) return INCLUDE;
int mc = UCD.mainCategoryMask(cat);
if (LETTER_MASK == mc && mc == UCD.mainCategoryMask(cat2)) return INCLUDE;
return BREAK;
*/
return INCLUDE;
}
static Map computedValue = new HashMap();
static String getComputedValue(int cp) {
return (String) computedValue.get(new Integer(cp));
@ -388,8 +401,8 @@ of characters, the first of which has a non-zero combining class.
}
static String lastValue = "";
static String currentValue = "";
boolean isCompEx(int cp) {
boolean isCompEx(int cp) {
if (ucdData.getBinaryProperty(cp, CompositionExclusion)) return true;
String decomp = ucdData.getDecompositionMapping(cp);
if (UTF32.length32(decomp) == 1) return true;
@ -397,17 +410,17 @@ of characters, the first of which has a non-zero combining class.
if (ucdData.getCombiningClass(first) != 0) return true;
return false;
}
StringBuffer foldBuffer = new StringBuffer();
/**
 * Case-folds a single code point.
 *
 * @param cp the code point to fold
 * @return the result of {@code ucdData.getCase(cp, FULL, FOLD)}
 */
String fold(int cp) {
    String folded = ucdData.getCase(cp, FULL, FOLD);
    return folded;
}
/**
 * Case-folds a whole string.
 *
 * @param s the text to fold
 * @return the result of {@code ucdData.getCase(s, FULL, FOLD)}
 */
String fold(String s) {
    String folded = ucdData.getCase(s, FULL, FOLD);
    return folded;
}
byte getDecompCat(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu
@ -416,7 +429,7 @@ of characters, the first of which has a non-zero combining class.
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
if (!nf[2].normalizationDiffers(cp)) return Lo;
String norm = nf[2].normalize(cp);
int cp2;
boolean gotUpper = false;
@ -437,4 +450,4 @@ of characters, the first of which has a non-zero combining class.
return cat;
}
}

View file

@ -1,23 +1,36 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
class DiffPropertyLister extends PropertyLister {
private UCD oldUCD;
/**
 * Builds a lister over the UCD obtained from {@code UCD.make(newUCDName)}.
 *
 * @param oldUCDName name handed to {@code UCD.make} for the older data set;
 *                   when {@code null}, {@code oldUCD} is left unassigned
 * @param newUCDName name handed to {@code UCD.make} for the data set stored
 *                   in {@code ucdData}
 * @param output     stream stored in the {@code output} field
 */
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
    this.output = output;
    ucdData = UCD.make(newUCDName);
    if (oldUCDName == null) {
        // No older version requested: nothing further to load.
        return;
    }
    oldUCD = UCD.make(oldUCDName);
}
/**
 * Single-argument status check: reports {@code INCLUDE} unconditionally,
 * whatever code point is passed.
 *
 * @param cp the code point being queried (not examined)
 * @return always {@code INCLUDE}
 */
public byte status (int cp) {
return INCLUDE;
}
/**
 * Uses the version string of {@code ucdData} as the property name; the
 * code point argument does not influence the result.
 *
 * @param cp the code point being listed (not examined)
 * @return {@code ucdData.getVersion()}
 */
public String propertyName(int cp) {
    String version = ucdData.getVersion();
    return version;
}
/*
public String optionalName(int cp) {
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
@ -27,7 +40,7 @@ class DiffPropertyLister extends PropertyLister {
}
}
*/
public byte status(int lastCp, int cp) {
/*if (cp == 0xFFFF) {
@ -36,7 +49,7 @@ class DiffPropertyLister extends PropertyLister {
*/
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
}
public int print() {
String status;
if (oldUCD != null) {
@ -56,10 +69,10 @@ class DiffPropertyLister extends PropertyLister {
} else {
output.println("# Total " + count + " code points allocated in " + ucdData.getVersion());
}
output.println();
return count;
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -8,19 +21,19 @@ import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
public static boolean DEBUG = false;
public static UCD ucd = UCD.make("310");
/**
 * Command-line entry point: runs {@code makeCaseFold()}. The arguments are
 * not read. The call to {@code getAge()} is currently commented out.
 *
 * @param args ignored
 * @throws java.io.IOException propagated from {@code makeCaseFold()}
 */
public static void main(String[] args) throws java.io.IOException {
makeCaseFold();
//getAge();
}
public static void makeCaseFold() throws java.io.IOException {
System.out.println("Making Full Data");
Map fullData = getCaseFolding(true);
System.out.println("Making Simple Data");
Map simpleData = getCaseFolding(false);
// write the data
System.out.println("Writing");
PrintWriter out = new PrintWriter(
new BufferedWriter(
@ -48,30 +61,30 @@ public class GenerateCaseFolding implements UCD_Types {
}
out.close();
}
static void drawLine(PrintWriter out, int ch, String type, String result) {
out.println(Utility.hex(ch)
+ "; " + type +
"; " + Utility.hex(result, " ") +
out.println(Utility.hex(ch)
+ "; " + type +
"; " + Utility.hex(result, " ") +
"; # " + ucd.getName(ch));
}
static Map getCaseFolding(boolean full) throws java.io.IOException {
Map data = new TreeMap();
Map repChar = new TreeMap();
//String option = "";
// get the equivalence classes
for (int ch = 0; ch < 0x10FFFF; ++ch) {
if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
if (!ucd.isRepresented(ch)) continue;
getClosure(ch, data, full);
}
// get the representative characters
Iterator it = data.keySet().iterator();
while (it.hasNext()) {
String s = (String) it.next();
@ -93,7 +106,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
if (rep == null) System.err.println("No representative for: " + toString(set));
else if (repGood < 128) {
System.err.println("Non-optimal!!: "
System.err.println("Non-optimal!!: "
+ ucd.getName(rep) + ", " + toString(set,true));
}
it2 = set.iterator();
@ -104,7 +117,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return repChar;
}
static int goodness(String s, boolean full) {
if (s == null) return 0;
int result = s.length();
@ -113,7 +126,7 @@ public class GenerateCaseFolding implements UCD_Types {
return result;
}
static Normalizer NFC = new Normalizer(Normalizer.NFC);
/*
static HashSet temp = new HashSet();
@ -135,12 +148,12 @@ public class GenerateCaseFolding implements UCD_Types {
}
}
*/
/*
String
String
String lower1 = ucd.getLowercase(ch);
String lower2 = ucd.toLowercase(ch,option);
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
//String lower1 = String.valueOf(ucd.getLowercase(ch));
//String lower = ucd.toLowercase(ch2,option);
@ -148,9 +161,9 @@ public class GenerateCaseFolding implements UCD_Types {
String lowerUpper = ucd.toLowercase(upper,option);
//String title = ucd.toTitlecase(ch2,option);
//String lowerTitle = ucd.toLowercase(upper,option);
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
output.println(Utility.hex(ch)
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
output.println(Utility.hex(ch)
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
+ "; " + Utility.hex(lowerUpper," ")
+ ";\t#" + ucd.getName(ch)
@ -163,7 +176,7 @@ public class GenerateCaseFolding implements UCD_Types {
//}
}
*/
static void getClosure(int ch, Map data, boolean full) {
String charStr = UTF32.valueOf32(ch);
String lowerStr = lower(charStr, full);
@ -171,17 +184,17 @@ public class GenerateCaseFolding implements UCD_Types {
String upperStr = upper(charStr, full);
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
// make new set
Set set = new TreeSet();
set.add(charStr);
data.put(charStr, set);
// add cases to get started
add(set, lowerStr, data);
add(set, upperStr, data);
add(set, titleStr, data);
// close it
main:
while (true) {
@ -197,15 +210,15 @@ public class GenerateCaseFolding implements UCD_Types {
break;
}
}
/**
 * Lowercases {@code s} via {@code lower2(s, full)} and then rewrites every
 * U+03C2 (Greek small letter final sigma) in the result to U+03C3 (Greek
 * small letter sigma).
 *
 * @param s    the text to lowercase
 * @param full passed straight through to {@code lower2}
 * @return the lowercased text with final sigma replaced by sigma
 */
static String lower(String s, boolean full) {
    // HACK for lower: final sigma is replaced with plain sigma
    return lower2(s, full).replace('\u03C2', '\u03C3');
}
// These functions are no longer necessary, since UCD is parameterized,
// but it's not worth changing
static String lower2(String s, boolean full) {
if (!full) {
if (s.length() != 1) return s;
@ -213,7 +226,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return ucd.getCase(s, FULL, LOWER);
}
static String upper(String s, boolean full) {
if (!full) {
if (s.length() != 1) return s;
@ -221,7 +234,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return ucd.getCase(s, SIMPLE, UPPER);
}
static String title(String s, boolean full) {
if (!full) {
if (s.length() != 1) return s;
@ -229,7 +242,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return ucd.getCase(s, SIMPLE, TITLE);
}
static boolean add(Set set, String s, Map data) {
if (set.contains(s)) return false;
set.add(s);
@ -246,7 +259,7 @@ public class GenerateCaseFolding implements UCD_Types {
if (DEBUG) System.err.println("done adding: " + toString(set));
return true;
}
static String toString(Set set) {
String result = "{";
Iterator it2 = set.iterator();
@ -259,7 +272,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return result + "}";
}
static String toString(Set set, boolean t) {
String result = "{";
Iterator it2 = set.iterator();
@ -272,7 +285,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
return result + "}";
}
static final void getAge() throws IOException {
PrintStream log = new PrintStream(
new BufferedOutputStream (
@ -298,37 +311,37 @@ public class GenerateCaseFolding implements UCD_Types {
UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
log.println();
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
+ n.format(u11.count()));
log.println();
u11.print(log, false, false, "1.1");
UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
log.println();
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
+ n.format(u20m.count()));
log.println();
u20m.print(log, false, false, "2.0");
UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
log.println();
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
+ n.format(u21m.count()));
log.println();
u21m.print(log, false, false, "2.1");
UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
log.println();
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
+ n.format(u30m.count()));
log.println();
u30m.print(log, false, false, "3.0");
UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
log.println();
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
+ n.format(u31m.count()));
log.println();
u31m.print(log, false, false, "3.1");
@ -336,7 +349,7 @@ public class GenerateCaseFolding implements UCD_Types {
} finally {
if (log != null) log.close();
}
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -8,21 +21,21 @@ import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
public class GenerateData implements UCD_Types {
public static void main (String[] args) throws IOException {
System.out.println("START");
ucd = UCD.make();
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
String version = ucd.getVersion();
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.charAt(0) == '#') return; // skip rest of line
int mask = 0;
Utility.fixDot();
System.out.println("Argument: " + args[i]);
if (arg.equalsIgnoreCase("version")) {
version = args[++i];
ucd = UCD.make(version);
@ -37,13 +50,13 @@ public class GenerateData implements UCD_Types {
"DerivedBidiClass-" + version );
} else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-" + version );
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedEastAsianWidth-" + version );
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedGeneralCategory-" + version );
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
@ -91,30 +104,30 @@ public class GenerateData implements UCD_Types {
System.out.println(" ! Unknown option -- must be one of the following (case-insensitive)");
System.out.println(" ! generateCompExclusions,...");
}
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
// HEADER_DERIVED, "DerivedPropData2-" + version );
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-" + version );
//listStrings("LowerCase-" + version , 0,0);
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-" + version );
// AGE stuff
//UCD ucd = UCD.make();
//System.out.println(ucd.getAgeID(0x61));
//System.out.println(ucd.getAgeID(0x2FA1D));
//
}
System.out.println("END");
}
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
public static void checkHoffman(String test) {
String result = nfkc.normalize(test);
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
@ -123,7 +136,7 @@ public class GenerateData implements UCD_Types {
System.out.println();
show(result, 0);
}
public static void show(String s, int indent) {
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
@ -137,16 +150,16 @@ public class GenerateData implements UCD_Types {
}
}
}
// Shared date formatter: yyyy-MM-dd, a space, HH:mm:ss.S, then the literal text " GMT".
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
static {
// Format in the GMT zone so the rendered time agrees with the literal " GMT" suffix in the pattern.
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
public static String fixFile(String s) {
int len = s.length();
if (!s.endsWith(".txt")) return s;
@ -156,9 +169,9 @@ public class GenerateData implements UCD_Types {
System.out.println("Fixing File Name");
return s.substring(0,len-6) + s.substring(len-4);
}
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
output.println("# " + fileName + ".txt");
output.println("#");
@ -179,7 +192,7 @@ public class GenerateData implements UCD_Types {
output.println("# ================================================");
output.println();
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
ucd = UCD.make("310");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
@ -194,13 +207,13 @@ public class GenerateData implements UCD_Types {
}
output.close();
}
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
if (!ucd.isRepresented(i)) continue;
@ -215,17 +228,17 @@ public class GenerateData implements UCD_Types {
output.close();
}
*/
/**
 * Runs a {@code CompLister} over a freshly opened
 * {@code GEN_DIR + "CompositionExclusionsDelta.txt"} and closes the file
 * afterwards.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void generateCompExclusions() throws IOException {
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
    try {
        new CompLister(output).print();
    } finally {
        // Close even when the lister throws, so the file handle is not leaked.
        output.close();
    }
}
static class CompLister extends PropertyLister {
UCD oldUCD;
int oldLength = 0;
public CompLister(PrintStream output) {
this.output = output;
ucdData = UCD.make("310");
@ -236,7 +249,7 @@ public class GenerateData implements UCD_Types {
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
}
public byte status(int cp) {
if (ucdData.getDecompositionType(cp) == CANONICAL
if (ucdData.getDecompositionType(cp) == CANONICAL
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
int temp = oldLength;
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
@ -246,11 +259,11 @@ public class GenerateData implements UCD_Types {
return EXCLUDE;
}
}
public static void partitionProperties() throws IOException {
// find properties
int count = 0;
int[] props = new int[500];
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
@ -260,7 +273,7 @@ public class GenerateData implements UCD_Types {
props[count++] = i;
}
System.out.println("props: " + count);
BitSet probe = new BitSet();
Map map = new HashMap();
int total = 0;
@ -269,12 +282,12 @@ public class GenerateData implements UCD_Types {
int cat = ucd.getCategory(cp);
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
if (!ucd.isAllocated(cp)) continue;
for (int i = 0; i < count; ++i) {
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
if (iProp) probe.set(i); else probe.clear(i);
}
++total;
if (!map.containsKey(probe)) {
map.put(probe.clone(), UTF32.valueOf32(cp));
@ -282,27 +295,27 @@ public class GenerateData implements UCD_Types {
System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + ucd.getCodeAndName(cp));
}
}
Utility.fixDot();
System.out.println("Set Size: " + map.size());
}
public static void listDifferences() throws IOException {
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "PropertyDifferences.txt"));
for (int i = 1; i < LIMIT_ENUM; ++i) {
int iType = i & 0xFF00;
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
System.out.println();
System.out.println();
System.out.println(iNameLong);
output.println("#" + iNameLong);
int last = -1;
for (int j = i+1; j < LIMIT_ENUM; ++j) {
int jType = j & 0xFF00;
@ -320,17 +333,17 @@ public class GenerateData implements UCD_Types {
System.out.print('.');
}
System.out.flush();
int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
int cat = ucd.getCategory(cp);
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
if (!ucd.isAllocated(cp)) continue;
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
if (jProp) ++jCount;
if (iProp) {
++iCount;
@ -339,21 +352,21 @@ public class GenerateData implements UCD_Types {
} else if (jProp) ++j_iPropCount;
}
if (iCount == 0 || jCount == 0) continue;
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
String rel = bothCount == 0 ? "DISJOINT"
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
: i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
: j_iPropCount == 0 ? "CONTAINS"
: "OVERLAPS";
if (j_iPropCount > i_jPropCount) {
// reverse output
output.println(jNameShort + "\t" + iNameShort + "\t" + rel
+ "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
} else {
} else {
output.println(iNameShort + "\t" + jNameShort + "\t" + rel
+ "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
}
@ -361,8 +374,8 @@ public class GenerateData implements UCD_Types {
}
output.close();
}
public static void listProperties() {
for (int i = 0; i < LIMIT_ENUM; ++i) {
int type = i & 0xFF00;
@ -373,7 +386,7 @@ public class GenerateData implements UCD_Types {
else if (value.equals("<unused>")) continue;
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
if (abbvalue.length() == 0) abbvalue = "no";
if (type == COMBINING_CLASS) {
value = MyPropertyLister.getCombiningName(i);
if (value.length() == 0) {
@ -382,32 +395,32 @@ public class GenerateData implements UCD_Types {
}
abbvalue = value;
}
String elide = "";
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
+ abbvalue
+ "}";
String abb = "";
if (type != BINARY_PROPERTIES) abb = "\\p{"
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
if (type != BINARY_PROPERTIES) abb = "\\p{"
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
+ "="
+ abbvalue
+ "}";
String norm = "";
if (type != BINARY_PROPERTIES) norm = "\\p{"
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
if (type != BINARY_PROPERTIES) norm = "\\p{"
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
+ "="
+ value
+ "}";
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
}
}
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
int headerChoice, String file) throws IOException {
//System.out.println(ucd.toString(0x1E0A));
/*
System.out.println(ucd.getData(0xFFFF));
@ -418,14 +431,14 @@ public class GenerateData implements UCD_Types {
if (true) return;
String test2 = ucd.getName(0x2A6D6);
//*/
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file + "dX.txt"));
doHeader(file, output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (BINARY_PROPERTIES | Non_break)
|| i == (JOINING_TYPE | JT_U)
|| i == (JOINING_GROUP | NO_SHAPING)
@ -447,7 +460,7 @@ public class GenerateData implements UCD_Types {
output.println();
}
System.out.print(".");
new MyPropertyLister(ucd, i, output).print();
new MyPropertyLister(ucd, i, output).print();
}
if (endEnum == LIMIT_ENUM) {
output.println();
@ -457,7 +470,7 @@ public class GenerateData implements UCD_Types {
output.println();
System.out.println();
System.out.println("@NUMERIC VALUES");
Set floatSet = new TreeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
float nv = ucd.getNumericValue(i);
@ -474,21 +487,21 @@ public class GenerateData implements UCD_Types {
output.close();
System.out.println();
}
static UCD ucd;
static public Normalizer formC, formD, formKC, formKD;
static public void writeNormalizerTestSuite(String fileName) throws IOException {
ucd = UCD.make();
PrintWriter log = Utility.openPrintWriter(fileName);
formC = new Normalizer(Normalizer.NFC);
formD = new Normalizer(Normalizer.NFD);
formKC = new Normalizer(Normalizer.NFKC);
formKD = new Normalizer(Normalizer.NFKD);
String[] example = new String[256];
log.println("# " + fixFile(fileName));
@ -522,24 +535,24 @@ public class GenerateData implements UCD_Types {
log.println("# implementations:");
log.println("#");
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
System.out.println("Writing Part 1");
log.println("#");
log.println("@Part0 # Specific cases");
log.println("#");
for (int j = 0; j < testSuiteCases.length; ++j) {
writeLine(testSuiteCases[j], log, false);
}
System.out.println("Writing Part 2");
log.println("#");
log.println("@Part1 # Character by character test");
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
log.println("#");
for (int ch = 0; ch < 0x10FFFF; ++ch) {
Utility.dot(ch);
if (!ucd.isAssigned(ch)) continue;
@ -548,7 +561,7 @@ public class GenerateData implements UCD_Types {
writeLine(cc,log, true);
}
Utility.fixDot();
System.out.println("Finding Examples");
for (int ch = 0; ch < 0x10FFFF; ++ch) {
@ -558,24 +571,24 @@ public class GenerateData implements UCD_Types {
int cc = ucd.getCombiningClass(ch);
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
}
Utility.fixDot();
System.out.println("Writing Part 2");
log.println("#");
log.println("@Part2 # Canonical Order Test");
log.println("#");
for (int ch = 0; ch < 0x10FFFF; ++ch) {
Utility.dot(ch);
if (!ucd.isAssigned(ch)) continue;
if (ucd.isPUA(ch)) continue;
short c = ucd.getCombiningClass(ch);
if (c == 0) continue;
// add character with higher class, same class, lower class
String sample = "";
for (int i = c+1; i < example.length; ++i) {
if (example[i] == null) continue;
@ -588,7 +601,7 @@ public class GenerateData implements UCD_Types {
sample += example[i];
break;
}
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
}
@ -597,14 +610,14 @@ public class GenerateData implements UCD_Types {
log.println("# END OF FILE");
log.close();
}
static void writeLine(String cc, PrintWriter log, boolean check) {
String c = formC.normalize(cc);
String d = formD.normalize(cc);
String kc = formKC.normalize(cc);
String kd = formKD.normalize(cc);
if (check & cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd)) return;
// consistency check
String dc = formD.normalize(c);
String dkc = formD.normalize(kc);
@ -613,18 +626,18 @@ public class GenerateData implements UCD_Types {
Normalizer.SHOW_PROGRESS = true;
d = formD.normalize(cc);
}
// printout
log.println(
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
+ "; # ("
+ "; # ("
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
+ ") " + ucd.getName(cc));
}
static StringBuffer commaResult = new StringBuffer();
// not recursive!!!
static final String comma(String s) {
commaResult.setLength(0);
@ -636,7 +649,7 @@ public class GenerateData implements UCD_Types {
}
return commaResult.toString();
}
static final String[] testSuiteCases = {
"\u1E0A",
"\u1E0C",

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.utility;
import java.io.*;

View file

@ -1,23 +1,36 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
class MyFloatLister extends PropertyLister {
private float propMask;
/**
 * Creates a lister that selects code points by numeric value.
 *
 * @param ucd    the Unicode Character Database instance to query
 * @param f      the numeric value stored as this lister's {@code propMask}
 * @param output the stream the listing is written to
 */
public MyFloatLister(UCD ucd, float f, PrintStream output) {
    // The three assignments are independent of one another.
    this.ucdData = ucd;
    this.output = output;
    this.propMask = f;
}
/**
 * Returns the numeric value of the code point rendered as text.
 *
 * @param cp the code point being listed
 * @return the string form of {@code ucdData.getNumericValue(cp)}
 */
public String propertyName(int cp) {
    return String.valueOf(ucdData.getNumericValue(cp));
}
/**
 * Supplies the optional name for a code point by delegating to
 * {@code ucdData.getNumericTypeID(cp)}.
 *
 * @param cp the code point being listed
 * @return the numeric type ID reported by the UCD data for {@code cp}
 */
public String optionalName(int cp) {
return ucdData.getNumericTypeID(cp);
}
public byte status(int cp) {
//if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp));
if (!ucdData.isRepresented(cp)) {
@ -28,4 +41,4 @@ class MyFloatLister extends PropertyLister {
return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
}
}

View file

@ -1,21 +1,34 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
final class MyPropertyLister extends PropertyLister {
static final boolean BRIDGE = false;
private int propMask;
/**
 * Creates a lister for one unified property value.
 *
 * @param ucd      the Unicode Character Database instance to query
 * @param propMask the unified property number this lister selects on
 * @param output   the stream the listing is written to
 */
public MyPropertyLister(UCD ucd, int propMask, PrintStream output) {
    this.ucdData = ucd;
    this.output = output;
    this.propMask = propMask;
    // skip gen cat: masks below COMBINING_CLASS do not get the per-line property comment
    if (propMask < COMBINING_CLASS) {
        usePropertyComment = false;
    }
}
static String getCombiningName (int propMask) {
String s = "";
switch (propMask & 0xFF) {
@ -46,7 +59,7 @@ final class MyPropertyLister extends PropertyLister {
}
return s;
}
public String headerString() {
int main = (propMask & 0xFF00);
if (main == COMBINING_CLASS) {
@ -63,18 +76,18 @@ final class MyPropertyLister extends PropertyLister {
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
}
}
/**
 * Returns the property label for this lister. The code point argument is not
 * used; the label comes from {@code getUnifiedBinaryPropertyID(propMask)},
 * i.e. it depends only on the mask this lister was constructed with.
 *
 * @param cp the code point being listed (ignored)
 * @return the ID string for this lister's {@code propMask}
 */
public String propertyName(int cp) {
return getUnifiedBinaryPropertyID(propMask);
}
/**
 * Returns the comment text attached to a listed code point.
 *
 * @param cp the code point being listed
 * @return {@code ""} when {@code propMask} is below {@code COMBINING_CLASS};
 *         {@code "L&"} when the category is Lu, Ll or Lt; otherwise the
 *         category ID reported by the UCD data
 */
public String optionalComment(int cp) {
    // skip gen cat
    if (propMask < COMBINING_CLASS) {
        return "";
    }
    final int category = ucdData.getCategory(cp);
    // The three cased-letter categories are reported under the shared label "L&".
    final boolean casedLetter = category == Lt || category == Ll || category == Lu;
    return casedLetter ? "L&" : ucdData.getCategoryID(cp);
}
/*
public String optionalName(int cp) {
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
@ -84,7 +97,7 @@ final class MyPropertyLister extends PropertyLister {
}
}
*/
public byte status(int cp) {
//if (cp == 0xFFFF) {
// System.out.println("# " + Utility.hex(cp));
@ -93,7 +106,7 @@ final class MyPropertyLister extends PropertyLister {
//if (cp == 0x0385) {
// System.out.println(Utility.hex(firstRealCp));
//}
if (cat == Cn
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
@ -101,7 +114,7 @@ final class MyPropertyLister extends PropertyLister {
if (BRIDGE) return CONTINUE;
else return EXCLUDE;
}
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
/*
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
@ -119,7 +132,7 @@ final class MyPropertyLister extends PropertyLister {
if (!inSet) return EXCLUDE;
return INCLUDE;
}
/**
* @return unified property number
*/
@ -141,12 +154,12 @@ final class MyPropertyLister extends PropertyLister {
case AGE>>8: return propMask < LIMIT_AGE;
default: return false;
}
}
}
/**
 * Instance convenience overload: evaluates a unified property for a code
 * point against this lister's own {@code ucdData} by delegating to the static
 * three-argument form.
 *
 * @param cp       the code point to test
 * @param propMask the unified property number (this parameter shadows the
 *                 {@code propMask} field)
 * @return the result of the static {@code getUnifiedBinaryProperty}
 */
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
return getUnifiedBinaryProperty(ucdData, cp, propMask);
}
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
@ -177,21 +190,21 @@ final class MyPropertyLister extends PropertyLister {
return ucd.getAge(cp) == propMask;
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
}
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
/**
 * Instance convenience overload: looks up the ID of a unified property using
 * this lister's {@code ucdData} and the {@code NORMAL} style.
 *
 * @param unifiedPropMask the unified property number
 * @return the ID produced by the static three-argument overload
 */
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
}
/**
 * Builds a combined label for a unified property from its {@code LONG} and
 * {@code SHORT} IDs.
 *
 * @param ucd             the Unicode Character Database instance to query
 * @param unifiedPropMask the unified property number
 * @return the single ID when the long and short forms are equal, otherwise
 *         the short form followed by the long form in parentheses
 */
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
    // Lookup order (LONG first, then SHORT) is kept as in the original.
    final String fullName = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
    final String abbreviation = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
    return fullName.equals(abbreviation)
        ? fullName
        : abbreviation + "(" + fullName + ")";
}
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
String pre = "";
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
@ -205,7 +218,7 @@ final class MyPropertyLister extends PropertyLister {
if (shortOne.length() == 0) shortOne = "xx";
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
if (longOne.length() == 0) longOne = "none";
String post;
if (style < LONG) post = shortOne;
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
@ -215,10 +228,10 @@ final class MyPropertyLister extends PropertyLister {
pre = post + "=";
post = "T";
}
return pre + post;
}
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
int enum = unifiedPropMask >> 8;
byte propMask = (byte)unifiedPropMask;
@ -264,7 +277,7 @@ final class MyPropertyLister extends PropertyLister {
return ucd.getAgeID_fromIndex(propMask);
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -16,13 +29,13 @@ import com.ibm.text.utility.*;
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
public final class Normalizer implements UCD_Types {
public static final String copyright =
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
public static boolean SHOW_PROGRESS = false;
/**
* Create a normalizer for a given form.
*/
@ -31,41 +44,41 @@ public final class Normalizer implements UCD_Types {
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
this.data = getData(unicodeVersion);
}
/**
 * Creates a normalizer for a given form, passing an empty Unicode version
 * string to the two-argument constructor. An empty version is replaced by
 * {@code UCD.latestVersion} when the normalization data is looked up.
 *
 * @param form the form selector, e.g. {@code NFD}, {@code NFKD}, {@code NFC}
 *             or {@code NFKC}
 */
public Normalizer(byte form) {
this(form,"");
}
/**
* Masks for the form selector
*/
public static final byte
public static final byte
COMPATIBILITY_MASK = 1,
COMPOSITION_MASK = 2;
/**
* Normalization Form Selector
*/
public static final byte
NFD = 0 ,
public static final byte
NFD = 0 ,
NFKD = COMPATIBILITY_MASK,
NFC = COMPOSITION_MASK,
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
/**
* Normalizes text according to the chosen form,
* Normalizes text according to the chosen form,
* replacing contents of the target buffer.
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
public StringBuffer normalize(String source, StringBuffer target) {
// First decompose the source into target,
// then compose if the form requires.
if (source.length() != 0) {
internalDecompose(source, target);
if (composition) {
@ -83,7 +96,7 @@ public final class Normalizer implements UCD_Types {
public String normalize(String source) {
    // Normalize into a fresh scratch buffer and return the resulting text.
    StringBuffer scratch = new StringBuffer();
    return normalize(source, scratch).toString();
}
/**
* Normalizes text according to the chosen form
* @param source the original text, unnormalized
@ -92,18 +105,18 @@ public final class Normalizer implements UCD_Types {
public String normalize(int cp) {
    // Turn the code point into its UTF-16 string form, then reuse the String overload.
    String asText = UTF16.valueOf(cp);
    return normalize(asText);
}
// Scratch buffer reused by hasDecomposition(). Because it is shared instance
// state, concurrent calls on the same Normalizer would interfere with each other.
private StringBuffer hasDecompositionBuffer = new StringBuffer();

/**
 * Reports whether normalizing a single code point under this normalizer's
 * form yields anything other than the code point itself.
 *
 * <p>The comparison is made on the whole UTF-16 string. The earlier check
 * ({@code length() != 1}) treated every supplementary code point as changed,
 * since such a code point occupies two UTF-16 units even when normalization
 * leaves it untouched. For BMP code points the result is the same as before.
 *
 * @param cp the code point to test
 * @return {@code true} if the normalized form differs from {@code cp}
 */
public boolean hasDecomposition(int cp) {
    String original = UTF16.valueOf(cp);
    hasDecompositionBuffer.setLength(0);
    normalize(original, hasDecompositionBuffer);
    return !original.equals(hasDecompositionBuffer.toString());
}
/**
* Does a quick check to see if the string is in the current form. Checks canonical order and
* isAllowed().
@ -112,7 +125,7 @@ public final class Normalizer implements UCD_Types {
*/
/*
public static final int NO = 0, YES = 1, MAYBE = -1;
public int quickCheck(String source) {
short lastCanonicalClass = 0;
int result = YES;
@ -128,7 +141,7 @@ public final class Normalizer implements UCD_Types {
}
return result;
}
/**
* Find whether the given character is allowed in the current form.
* @return YES, NO, MAYBE
@ -153,7 +166,7 @@ public final class Normalizer implements UCD_Types {
}
return YES;
}
/**
* Utility: Gets the combining class of a character from the
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
@ -161,13 +174,13 @@ public final class Normalizer implements UCD_Types {
* @param ch the source character
* @return value from 0 to 255
*/
public short getCanonicalClass(char ch) {
return data.getCanonicalClass(ch);
}
/**
* Utility: Checks whether there is a recursive decomposition of a character from the
* Utility: Checks whether there is a recursive decomposition of a character from the
* Unicode Character Database. It is compatibility or canonical according to the particular
* normalizer.
* @param ch the source character
@ -175,11 +188,11 @@ public final class Normalizer implements UCD_Types {
public boolean normalizationDiffers(int ch) {
return data.normalizationDiffers(ch, composition, compatibility);
}
/**
* Utility: Gets recursive decomposition of a character from the
* Utility: Gets recursive decomposition of a character from the
* Unicode Character Database.
* @param compatibility If false selects the recursive
* @param compatibility If false selects the recursive
* canonical decomposition, otherwise selects
* the recursive compatibility AND canonical decomposition.
* @param ch the source character
@ -188,7 +201,7 @@ public final class Normalizer implements UCD_Types {
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
data.getRecursiveDecomposition(ch, buffer, compatibility);
}
/**
* Utility: Gets composition mapping.
* @return IntEnumeration with the pair -> value mapping, where the
@ -199,18 +212,18 @@ public final class Normalizer implements UCD_Types {
public IntHashtable.IntEnumeration getComposition() {
return data.getComposition();
}
*/
/**
 * Reports whether the normalization data marks a code point as trailing.
 * Forms that do not compose always answer {@code false} without consulting
 * the data.
 *
 * @param cp the code point to test
 * @return {@code data.isTrailing(cp)} for composing forms, else {@code false}
 */
public boolean isTrailing(int cp) {
    return composition && data.isTrailing(cp);
}
// ======================================
// PRIVATES
// ======================================
/**
* The current form.
*/
@ -221,7 +234,7 @@ public final class Normalizer implements UCD_Types {
* Decomposes text, either canonical or compatibility,
* replacing contents of the target buffer.
* @param form the normalization form. If COMPATIBILITY_MASK
* bit is on in this byte, then selects the recursive
* bit is on in this byte, then selects the recursive
* compatibility decomposition, otherwise selects
* the recursive canonical decomposition.
* @param source the original text, unnormalized
@ -234,20 +247,20 @@ public final class Normalizer implements UCD_Types {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compatibility);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
// bubble-sort combining marks as necessary
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
ch2 = UTF16Plus.charAt(target, k-1);
@ -273,9 +286,9 @@ public final class Normalizer implements UCD_Types {
int lastClass = data.getCanonicalClass(starterCh);
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
int oldLen = target.length();
// Loop on the decomposed characters, combining where possible
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
ch = UTF16Plus.charAt(target, decompPos);
@ -317,7 +330,7 @@ public final class Normalizer implements UCD_Types {
private BitSet canonicalRecompose = new BitSet();
private BitSet compatibilityRecompose = new BitSet();
static final int NOT_COMPOSITE = 0xFFFF;
Stub(String version) {
ucd = UCD.make(version);
for (int i = 0; i < 0x10FFFF; ++i) {
@ -336,23 +349,23 @@ public final class Normalizer implements UCD_Types {
}
int a = UTF16.charAt(s, 0);
if (ucd.getCombiningClass(a) != 0) continue;
int b = UTF16.charAt(s, UTF16.getCharCount(a));
isSecond.set(b);
// have a recomposition, so set the bit
canonicalRecompose.set(i);
// set the compatibility recomposition bit
// set the compatibility recomposition bit
// ONLY if the component characters
// don't compatibility decompose
if (ucd.getDecompositionType(a) <= CANONICAL
&& ucd.getDecompositionType(b) <= CANONICAL) {
compatibilityRecompose.set(i);
}
long key = (((long)a)<<32) | b;
/*if (i == '\u1E0A' || key == 0x004400000307) {
System.out.println(Utility.hex(s));
System.out.println(Utility.hex(i));
@ -392,15 +405,15 @@ Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
*/
/**
 * Returns the canonical combining class of a code point, as reported by
 * {@code ucd.getCombiningClass(cp)}.
 *
 * @param cp the code point to look up
 * @return the combining class
 */
short getCanonicalClass(int cp) {
return ucd.getCombiningClass(cp);
}
/**
 * Tests whether a code point is flagged in the {@code isSecond} bit set.
 * The constructor sets that bit for the second code point of each
 * decomposition it records for recomposition.
 *
 * @param cp the code point to test
 * @return {@code true} if the bit for {@code cp} is set
 */
boolean isTrailing(int cp) {
return isSecond.get(cp);
}
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
byte dt = ucd.getDecompositionType(cp);
if (!composition) {
@ -413,7 +426,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
else return dt == CANONICAL && !canonicalRecompose.get(cp);
}
}
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
byte dt = ucd.getDecompositionType(cp);
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
@ -427,7 +440,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
UTF16.append(buffer, cp);
}
}
int getPairwiseComposition(int starterCh, int ch) {
int hangulPoss = UCD.composeHangul(starterCh, ch);
if (hangulPoss != 0xFFFF) return hangulPoss;
@ -435,17 +448,17 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
if (obj == null) return 0xFFFF;
return ((Integer)obj).intValue();
}
}
/**
* Contains normalization data from the Unicode Character Database.
* use false for the minimal set, true for the real set.
* use false for the minimal set, true for the real set.
*/
private Stub data;
private static HashMap versionCache = new HashMap();
private static Stub getData (String version) {
if (version.length() == 0) version = UCD.latestVersion;
Stub result = (Stub)versionCache.get(version);
@ -455,7 +468,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
}
return result;
}
/**
* Just accessible for testing.
*/
@ -463,7 +476,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
boolean isExcluded (char ch) {
// Thin pass-through to the normalization data; no logic of its own.
return data.isExcluded(ch);
}
/**
* Just accessible for testing.
*/

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
@ -5,11 +18,11 @@ import com.ibm.text.utility.*;
abstract public class PropertyLister implements UCD_Types {
static final boolean COMPRESS_NAMES = false;
static final boolean DROP_INDICATORS = true;
protected UCD ucdData;
protected PrintStream output;
protected boolean showOnConsole;
@ -17,37 +30,37 @@ abstract public class PropertyLister implements UCD_Types {
protected int firstRealCp = -2;
protected int lastRealCp = -2;
protected boolean alwaysBreaks = false; // set to true if property only breaks
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
/**
/**
* @return status. Also have access to firstRealCp, lastRealCp
*/
abstract public byte status(int cp);
/**
 * Returns the header line for this listing. The default is the empty string;
 * {@code print()} writes the header to the output only when it is non-empty.
 *
 * @return the header text, {@code ""} by default
 */
public String headerString() {
return "";
}
/**
 * Returns the property label used when formatting a range; {@code format()}
 * calls this with the first code point of the range. The default is the
 * empty string.
 *
 * @param cp the code point being listed
 * @return the property label, {@code ""} by default
 */
public String propertyName(int cp) {
return "";
}
/**
 * Returns an optional extra name for a code point. The default is the empty
 * string; subclasses may override.
 *
 * @param cp the code point being listed
 * @return the optional name, {@code ""} by default
 */
public String optionalName(int cp) {
return "";
}
/**
 * Returns the comment text attached to a listed code point.
 *
 * @param cp the code point being listed
 * @return {@code ""} when {@code usePropertyComment} is off; {@code "L&"}
 *         when the category is Lu, Ll or Lt; otherwise the category ID
 *         reported by the UCD data
 */
public String optionalComment(int cp) {
    if (!usePropertyComment) {
        return "";
    }
    final int category = ucdData.getCategory(cp);
    // The three cased-letter categories are reported under the shared label "L&".
    final boolean casedLetter = category == Lt || category == Ll || category == Lu;
    return casedLetter ? "L&" : ucdData.getCategoryID(cp);
}
/**
 * Returns the minimum width of the property column. The default is 1.
 *
 * @return the minimum width, 1 by default
 */
public int minPropertyWidth() {
return 1;
}
public void format(int startCp, int endCp, int realCount) {
try {
String prop = propertyName(startCp);
@ -65,7 +78,7 @@ abstract public class PropertyLister implements UCD_Types {
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
+ prop + opt + pgap + " # " + optCom
+ countStr;
@ -75,7 +88,7 @@ abstract public class PropertyLister implements UCD_Types {
if (com == 0) {
line += startName + ".." + endName;
} else {
line += startName.substring(0,com)
line += startName.substring(0,com)
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
}
}
@ -93,17 +106,17 @@ abstract public class PropertyLister implements UCD_Types {
output.println(line);
if (showOnConsole) System.out.println(line);
} catch (Exception e) {
throw new ChainException("Format error {0}, {1}",
throw new ChainException("Format error {0}, {1}",
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
}
}
int width(int cp) {
return cp <= 0xFFFF ? 4
: cp <= 0xFFFFF ? 5
return cp <= 0xFFFF ? 4
: cp <= 0xFFFFF ? 5
: 6;
}
String getKenName(int cp) {
String result = ucdData.getName(cp);
if (result == null) return "";
@ -113,8 +126,8 @@ abstract public class PropertyLister implements UCD_Types {
}
return result;
}
/**
* @return common initial substring length ending with SPACE or HYPHEN-MINUS. 0 if there is none
*/
@ -136,14 +149,14 @@ abstract public class PropertyLister implements UCD_Types {
}
return lastSpace;
}
public int print() {
int count = 0;
firstRealCp = -1;
byte firstRealCpCat = -1;
lastRealCp = -1;
int realRangeCount = 0;
String header = headerString();
if (header.length() != 0) {
output.println(header);
@ -156,7 +169,7 @@ abstract public class PropertyLister implements UCD_Types {
if (cat == Lt || cat == Ll) cat = Lu;
if (cat != firstRealCpCat) s = BREAK;
}
switch(s) {
case CONTINUE:
break; // do nothing
@ -177,7 +190,7 @@ abstract public class PropertyLister implements UCD_Types {
lastRealCp = firstRealCp = cp;
firstRealCpCat = ucdData.getCategory(firstRealCp);
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
realRangeCount = 1;
count++;
break;
@ -193,7 +206,7 @@ abstract public class PropertyLister implements UCD_Types {
if (firstRealCp != -1) {
format(firstRealCp, lastRealCp, realRangeCount);
}
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
output.println();
output.println("# Total code points: " + count);

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -8,31 +21,31 @@ import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
public class TestData implements UCD_Types {
public static void main (String[] args) throws IOException {
System.out.println("START");
ucd = UCD.make();
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
int mask = 0;
if (false) {
generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedBidiClass-3.1.1d1.txt");
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-3.1.0d1.txt");
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedEastAsianWidth-3.1.0d1.txt");
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedGeneralCategory-3.1.0d1.txt");
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedCombiningClass-3.1.0d1.txt");
@ -53,41 +66,41 @@ public class TestData implements UCD_Types {
mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-3.1.0d1.txt");
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedLineBreak-3.1.0d1.txt");
generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-3.1.0d4.txt");
generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
KEEP_SPECIAL, HEADER_EXTEND, "PropList-3.1.0d5.txt");
writeNormalizerTestSuite("NormalizationTest-3.1.0d1.txt");
}
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
// HEADER_DERIVED, "DerivedPropData2-3.1.0d1.txt");
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-3.1.0d1.txt");
//listStrings("LowerCase-3.1.0d1.txt", 0,0);
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-3.1.0d1.txt");
// AGE stuff
//UCD ucd = UCD.make();
//System.out.println(ucd.getAgeID(0x61));
//System.out.println(ucd.getAgeID(0x2FA1D));
//generateCompExclusions();
System.out.println("END");
}
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
public static void checkHoffman(String test) {
String result = nfkc.normalize(test);
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
@ -96,7 +109,7 @@ public class TestData implements UCD_Types {
System.out.println();
show(result, 0);
}
public static void show(String s, int indent) {
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
@ -110,16 +123,16 @@ public class TestData implements UCD_Types {
}
}
}
// Shared timestamp formatter. The pattern ends with the literal text " GMT",
// so the formatter's time zone must be GMT for that suffix to be truthful.
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
// Runs once at class initialization, after the field above has been assigned.
static {
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
public static String fixFile(String s) {
int len = s.length();
if (!s.endsWith(".txt")) return s;
@ -129,9 +142,9 @@ public class TestData implements UCD_Types {
System.out.println("Fixing File Name");
return s.substring(0,len-6) + s.substring(len-4);
}
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
output.println("# " + fixFile(fileName));
output.println("#");
@ -152,7 +165,7 @@ public class TestData implements UCD_Types {
output.println("# ================================================");
output.println();
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
ucd = UCD.make("310");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
@ -167,13 +180,13 @@ public class TestData implements UCD_Types {
}
output.close();
}
/*
public static void listStrings(String file, int type, int subtype) throws IOException {
ucd = UCD.make("310");
UCD ucd30 = UCD.make("300");
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
for (int i = 0; i < 0x10FFFF; ++i) {
if ((i & 0xFFF) == 0) System.out.println("# " + i);
if (!ucd.isRepresented(i)) continue;
@ -188,17 +201,17 @@ public class TestData implements UCD_Types {
output.close();
}
*/
/**
 * Writes the composition-exclusion delta listing to
 * {@code GEN_DIR + "CompositionExclusionsDelta.txt"} by running a
 * {@code CompLister} over a newly opened output stream.
 *
 * <p>The stream is closed in a {@code finally} block, so the file handle is
 * released even when {@code print()} throws.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void generateCompExclusions() throws IOException {
    PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
    try {
        new CompLister(output).print();
    } finally {
        output.close();
    }
}
static class CompLister extends PropertyLister {
UCD oldUCD;
int oldLength = 0;
public CompLister(PrintStream output) {
this.output = output;
ucdData = UCD.make("310");
@ -209,7 +222,7 @@ public class TestData implements UCD_Types {
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
}
public byte status(int cp) {
if (ucdData.getDecompositionType(cp) == CANONICAL
if (ucdData.getDecompositionType(cp) == CANONICAL
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
int temp = oldLength;
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
@ -219,11 +232,11 @@ public class TestData implements UCD_Types {
return EXCLUDE;
}
}
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial, int headerChoice, String file) throws IOException {
//System.out.println(ucd.toString(0x1E0A));
/*
System.out.println(ucd.getData(0xFFFF));
@ -234,14 +247,14 @@ public class TestData implements UCD_Types {
if (true) return;
String test2 = ucd.getName(0x2A6D6);
//*/
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
doHeader(file, output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (CATEGORY | UNUSED_CATEGORY)
|| i == (BINARY_PROPERTIES | Non_break)
|| i == (JOINING_TYPE | JT_U)
@ -265,7 +278,7 @@ public class TestData implements UCD_Types {
output.println();
}
System.out.print(".");
new MyPropertyLister(ucd, i, output).print();
new MyPropertyLister(ucd, i, output).print();
}
if (endEnum == LIMIT_ENUM) {
output.println();
@ -275,7 +288,7 @@ public class TestData implements UCD_Types {
output.println();
System.out.println();
System.out.println("@NUMERIC VALUES");
Set floatSet = new TreeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
float nv = ucd.getNumericValue(i);
@ -292,13 +305,13 @@ public class TestData implements UCD_Types {
output.close();
System.out.println();
}
static UCD ucd;
static public Normalizer formC, formD, formKC, formKD;
static public void writeNormalizerTestSuite(String fileName) throws IOException {
PrintWriter log = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
@ -309,7 +322,7 @@ public class TestData implements UCD_Types {
formD = new Normalizer(Normalizer.NFD);
formKC = new Normalizer(Normalizer.NFKC);
formKD = new Normalizer(Normalizer.NFKD);
log.println("# " + fixFile(fileName));
log.println("#");
log.println("# Normalization Test Suite");
@ -341,24 +354,24 @@ public class TestData implements UCD_Types {
log.println("# implementations:");
log.println("#");
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
System.out.println("Writing Part 1");
log.println("#");
log.println("@Part0 # Specific cases");
log.println("#");
for (int j = 0; j < testSuiteCases.length; ++j) {
writeLine(testSuiteCases[j], log, false);
}
System.out.println("Writing Part 2");
log.println("#");
log.println("@Part1 # Character by character test");
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
log.println("#");
for (int ch = 0; ch < 0x10FFFF; ++ch) {
Utility.dot(ch);
if (!ucd.isAssigned(ch)) continue;
@ -367,7 +380,7 @@ public class TestData implements UCD_Types {
writeLine(cc,log, true);
}
Utility.fixDot();
System.out.println("Finding Examples");
String[] example = new String[256];
@ -379,7 +392,7 @@ public class TestData implements UCD_Types {
int cc = ucd.getCombiningClass(ch);
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
}
Utility.fixDot();
System.out.println("Writing Part 3");
@ -393,9 +406,9 @@ public class TestData implements UCD_Types {
if (ucd.isPUA(ch)) continue;
short c = ucd.getCombiningClass(ch);
if (c == 0) continue;
// add character with higher class, same class, lower class
String sample = "";
for (int i = c+1; i < example.length; ++i) {
if (example[i] == null) continue;
@ -408,7 +421,7 @@ public class TestData implements UCD_Types {
sample += example[i];
break;
}
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
}
@ -417,7 +430,7 @@ public class TestData implements UCD_Types {
log.println("# END OF FILE");
log.close();
}
static void writeLine(String cc, PrintWriter log, boolean check) {
String c = formC.normalize(cc);
String d = formD.normalize(cc);
@ -427,13 +440,13 @@ public class TestData implements UCD_Types {
log.println(
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
+ "; # ("
+ "; # ("
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
+ ") " + ucd.getName(cc));
}
static StringBuffer commaResult = new StringBuffer();
// not recursive!!!
static final String comma(String s) {
commaResult.setLength(0);
@ -445,7 +458,7 @@ public class TestData implements UCD_Types {
}
return commaResult.toString();
}
static final String[] testSuiteCases = {
"\u1E0A",
"\u1E0C",

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -8,56 +21,56 @@ import com.ibm.text.utility.*;
public final class TestNormalization {
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
static final boolean SKIP_FILE = true;
static PrintWriter out = null;
static BufferedReader in = null;
static Normalizer nfc;
static Normalizer nfd;
static Normalizer nfkc;
static Normalizer nfkd;
static UCD ucd;
static BitSet charsListed = new BitSet(0x110000);
static int errorCount = 0;
static int lineErrorCount = 0;
static String originalLine = "";
static String lastLine = "";
public static void main(String[] args) throws java.io.IOException {
System.out.println("Creating Normalizers");
ucd = UCD.make("");
nfc = new Normalizer(Normalizer.NFC);
nfd = new Normalizer(Normalizer.NFD);
nfkc = new Normalizer(Normalizer.NFKC);
nfkd = new Normalizer(Normalizer.NFKD);
String x = UTF32.valueOf32(0x10000);
check("NFC", nfc, x);
check("NFD", nfd, x);
check("NFKC", nfkc, x);
check("NFKD", nfkd, x);
out = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream("NormalizationTestLog.txt"),
"UTF8"),
32*1024));
in = new BufferedReader (
new FileReader (DIR + "NormalizationTest.txt"),
32*1024);
try {
String[] parts = new String[10];
System.out.println("Checking files");
int count = 0;
while (true) {
String line = in.readLine();
if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
@ -69,69 +82,69 @@ public final class TestNormalization {
}
line = line.trim();
if (line.length() == 0) continue;
int splitCount = Utility.split(line, ';', parts);
// FIX check splitCount
for (int i = 0; i < splitCount; ++i) {
parts[i] = Utility.fromHex(parts[i]);
}
if (UTF32.length32(parts[0]) == 1) {
int code = UTF32.char32At(parts[0],0);
charsListed.set(code);
if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
}
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
errorCount += check("NFCa", nfc, parts[1], parts[0]);
errorCount += check("NFCb", nfc, parts[1], parts[1]);
errorCount += check("NFCc", nfc, parts[1], parts[2]);
errorCount += check("NFCa", nfc, parts[1], parts[0]);
errorCount += check("NFCb", nfc, parts[1], parts[1]);
errorCount += check("NFCc", nfc, parts[1], parts[2]);
// c4 == NFC(c4) == NFC(c5)
errorCount += check("NFCd", nfc, parts[3], parts[3]);
errorCount += check("NFCe", nfc, parts[3], parts[4]);
errorCount += check("NFCd", nfc, parts[3], parts[3]);
errorCount += check("NFCe", nfc, parts[3], parts[4]);
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
errorCount += check("NFDa", nfd, parts[2], parts[0]);
errorCount += check("NFDb", nfd, parts[2], parts[1]);
errorCount += check("NFDc", nfd, parts[2], parts[2]);
errorCount += check("NFDa", nfd, parts[2], parts[0]);
errorCount += check("NFDb", nfd, parts[2], parts[1]);
errorCount += check("NFDc", nfd, parts[2], parts[2]);
// c5 == NFD(c4) == NFD(c5)
errorCount += check("NFDd", nfd, parts[4], parts[3]);
errorCount += check("NFDe", nfd, parts[4], parts[4]);
errorCount += check("NFDd", nfd, parts[4], parts[3]);
errorCount += check("NFDe", nfd, parts[4], parts[4]);
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
}
System.out.println("Total errors in file: " + errorCount
+ ", lines: " + lineErrorCount);
errorCount = lineErrorCount = 0;
System.out.println("Checking Missing");
checkMissing();
System.out.println("Total errors in unlisted items: " + errorCount
+ ", lines: " + lineErrorCount);
} finally {
if (in != null) in.close();
if (out != null) out.close();
}
}
static String lastBase = "";
public static int check(String type, Normalizer n, String base, String other) {
try {
String trans = n.normalize(other);
@ -149,8 +162,8 @@ public final class TestNormalization {
if (!base.equals(other)) {
otherList = "(" + ucd.getCodeAndName(other) + ")";
}
out.println("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
out.println("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
+ type
+ otherList
+ " == " + ucd.getCodeAndName(trans)
@ -159,17 +172,17 @@ public final class TestNormalization {
return 1;
}
} catch (Exception e) {
throw new ChainException("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
throw new ChainException("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
+ type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
}
return 0;
}
/**
 * Convenience overload: runs the four-argument {@code check} with
 * {@code base} supplied both as the expected value and as the text to
 * normalize.
 *
 * @param type label used in diagnostic output
 * @param n    normalizer to apply
 * @param base string passed as both the expected result and the input
 * @return the value returned by the four-argument {@code check}
 */
public static int check(String type, Normalizer n, String base) {
    final String expected = base;
    final String input = base;
    return check(type, n, expected, input);
}
static void checkMissing() {
for (int missing = 0; missing < 0x100000; ++missing) {
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
@ -180,6 +193,6 @@ public final class TestNormalization {
errorCount += check("NFKC", nfkc, x);
errorCount += check("NFKD", nfkd, x);
}
}
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.HashMap;
@ -16,14 +29,14 @@ public final class UCD implements UCD_Types {
* Used for the default version.
*/
public static final String latestVersion = "3.1.1";
/**
 * Returns the UCD instance for the default version, obtained by passing
 * the empty version string to {@code make(String)}.
 */
public static UCD make() {
    final String defaultVersion = "";
    return make(defaultVersion);
}
/**
* Create singleton instance for the specific version
*/
@ -37,21 +50,21 @@ public final class UCD implements UCD_Types {
}
return result;
}
/**
 * Returns the version string recorded for this UCD instance.
 */
public String getVersion() {
    return this.version;
}
/**
 * Returns the date value stored with the data (per the original note,
 * the date that the data was parsed).
 */
public long getDate() {
    return this.date;
}
/**
* Is the code point allocated?
*/
@ -64,14 +77,14 @@ public final class UCD implements UCD_Types {
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
return false;
}
/**
 * Reports whether the code point is assigned to a character (or
 * surrogate), i.e. its general category is anything other than
 * {@code Cn}.
 */
public boolean isAssigned(int codePoint) {
    final byte category = getCategory(codePoint);
    return category != Cn;
}
/**
* Is the code point a PUA character (fast check)
*/
@ -80,7 +93,7 @@ public final class UCD implements UCD_Types {
|| codePoint >= 0xF0000 && codePoint < 0xFFFFE
|| codePoint >= 0x100000 && codePoint < 0x10FFFE);
}
/**
* Many ranges are elided in the UCD. All but the first are not actually
* represented in the data internally. This detects such cases.
@ -88,21 +101,21 @@ public final class UCD implements UCD_Types {
public boolean isRepresented(int codePoint) {
    // A code point counts as represented when a raw record is stored for it.
    final UData raw = getRaw(codePoint);
    return raw != null;
}
/**
 * Returns the FULL string form of the data record for the code point
 * (described by the original note as an XML version of the data).
 */
public String toString(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.toString(FULL);
}
}
/**
 * Returns the character name stored in the record for the code point.
 * The record is fetched with string fix-up enabled.
 */
public String getName(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.name;
}
/**
* Get the character names for the code points in a string, separated by ", "
*/
@ -117,14 +130,14 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
/**
 * Formats a code point in U+ notation: the literal prefix "U+" followed
 * by the output of {@code Utility.hex}.
 */
public static String getCode(int codePoint) {
    final StringBuffer text = new StringBuffer("U+");
    text.append(Utility.hex(codePoint));
    return text.toString();
}
/**
* Get the code in U+ notation
*/
@ -139,14 +152,14 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
/**
 * Returns the code and name of a code point in the form
 * "U+xxxx NAME": the U+ notation, a single space, then the name.
 */
public String getCodeAndName(int codePoint) {
    final String code = getCode(codePoint);
    final String name = getName(codePoint);
    return code + " " + name;
}
/**
* Get the name and number (U+xxxx NAME) for the code points in a string,
* separated by ", "
@ -163,14 +176,14 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
/**
 * Returns the general category stored in the record for the code point.
 * String fix-up is not requested for this lookup.
 */
public byte getCategory(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.generalCategory;
}
/**
* Get the main category, as a mask
*/
@ -187,7 +200,7 @@ public final class UCD implements UCD_Types {
}
throw new IllegalArgumentException ("Illegal General Category " + cat);
}
/**
* Get the combining class, a number between zero and 255. Returned
* as a short to avoid the signed-byte problem in Java
@ -195,46 +208,46 @@ public final class UCD implements UCD_Types {
public short getCombiningClass(int codePoint) {
    // Mask to 0..255 so the stored value is read as unsigned before narrowing.
    final int unsignedClass = get(codePoint, false).combiningClass & 0xFF;
    return (short) unsignedClass;
}
/**
 * Reports whether the given combining class value is marked in the set
 * of combining classes recorded for this version of the data.
 */
public boolean isCombiningClassUsed(byte value) {
    final int unsignedValue = value & 0xFF;
    return combiningClassSet.get(unsignedValue);
}
/**
 * Returns the bidi class stored for the code point.
 */
public byte getBidiClass(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.bidiClass;
}
/**
 * Returns the RAW decomposition mapping stored for the code point.
 * Per the original note, it must be applied recursively to obtain the
 * full mapping.
 */
public String getDecompositionMapping(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.decompositionMapping;
}
/**
 * Returns the bidi mirroring value stored for the code point.
 */
public String getBidiMirror(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.bidiMirror;
}
/**
 * Returns the RAW decomposition type stored for the code point
 * (the &lt;...&gt; field in the UCD data, per the original note).
 */
public byte getDecompositionType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.decompositionType;
}
/**
 * Returns the numeric value stored for the code point.
 */
public float getNumericValue(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.numericValue;
}
/**
 * Returns the numeric type stored for the code point.
 */
public byte getNumericType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.numericType;
}
@ -242,11 +255,11 @@ public final class UCD implements UCD_Types {
/**
 * Case mapping of a single code point with no condition: delegates to
 * the four-argument overload with an empty condition string.
 */
public String getCase(int codePoint, byte simpleVsFull, byte caseType) {
    final String noCondition = "";
    return getCase(codePoint, simpleVsFull, caseType, noCondition);
}
/**
 * Case mapping of a string with no condition: delegates to the
 * four-argument overload with an empty condition string.
 */
public String getCase(String s, byte simpleVsFull, byte caseType) {
    final String noCondition = "";
    return getCase(s, simpleVsFull, caseType, noCondition);
}
public String getCase(int codePoint, byte simpleVsFull, byte caseType, String condition) {
UData udata = get(codePoint, true);
if (caseType < LOWER || caseType > FOLD
@ -255,7 +268,7 @@ public final class UCD implements UCD_Types {
}
if (caseType < FOLD) {
if (simpleVsFull == FULL && udata.specialCasing.length() != 0) {
if (condition.length() == 0
if (condition.length() == 0
|| udata.specialCasing.indexOf(condition) < 0) {
simpleVsFull = SIMPLE;
}
@ -268,7 +281,7 @@ public final class UCD implements UCD_Types {
else simpleVsFull = FULL;
}
}
switch (caseType + simpleVsFull) {
case SIMPLE + UPPER: return udata.simpleUppercase;
case SIMPLE + LOWER: return udata.simpleLowercase;
@ -281,7 +294,7 @@ public final class UCD implements UCD_Types {
}
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
}
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType);
StringBuffer result = new StringBuffer();
@ -291,7 +304,7 @@ public final class UCD implements UCD_Types {
cp = UTF32.char32At(s, i);
String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition);
result.append(mappedVersion);
if (caseType == TITLE) {
if (caseType == TITLE) {
// if letter is cased, change to lowercase, otherwise change to TITLE
byte cat = getCategory(cp);
if (cat == Mn || cat == Me || cat == Mc) {
@ -307,60 +320,60 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
/*
public String getSimpleLowercase(int codePoint) {
return get(codePoint, true).simpleLowercase;
}
public String getSimpleUppercase(int codePoint) {
return get(codePoint, true).simpleUppercase;
}
public String getSimpleTitlecase(int codePoint) {
return get(codePoint, true).simpleTitlecase;
}
public String getSimpleCaseFolding(int codePoint) {
return get(codePoint, true).simpleCaseFolding;
}
public String getFullLowercase(int codePoint) {
return get(codePoint, true).fullLowercase;
}
public String getFullUppercase(int codePoint) {
return get(codePoint, true).fullUppercase;
}
public String getFullTitlecase(int codePoint) {
return get(codePoint, true).fullTitlecase;
}
public String getFullCaseFolding(int codePoint) {
return get(codePoint, true).simpleCaseFolding;
}
public String getLowercase(int codePoint, boolean full) {
if (full) return getFullLowercase(codePoint);
return getSimpleLowercase(codePoint);
}
public String getUppercase(int codePoint, boolean full) {
if (full) return getFullUppercase(codePoint);
return getSimpleLowercase(codePoint);
}
public String getTitlecase(int codePoint, boolean full) {
if (full) return getFullTitlecase(codePoint);
return getSimpleTitlecase(codePoint);
}
public String getCaseFolding(int codePoint, boolean full) {
if (full) return getFullCaseFolding(codePoint);
return getSimpleCaseFolding(codePoint);
}
public String getLowercase(String s, boolean full) {
if (s.length() == 1) return getLowercase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
@ -372,7 +385,7 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
public String getUppercase(String s, boolean full) {
if (s.length() == 1) return getUppercase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
@ -384,7 +397,7 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
public String getTitlecase(String s, boolean full) {
if (s.length() == 1) return getTitlecase(s.charAt(0), true);
StringBuffer result = new StringBuffer();
@ -396,7 +409,7 @@ public final class UCD implements UCD_Types {
}
return result.toString();
}
public String getCaseFolding(String s, boolean full) {
if (s.length() == 1) return getCaseFolding(s.charAt(0), true);
StringBuffer result = new StringBuffer();
@ -409,184 +422,184 @@ public final class UCD implements UCD_Types {
return result.toString();
}
*/
/** Returns the special-casing string stored for the code point. */
public String getSpecialCase(int codePoint) {
    final UData entry = get(codePoint, true);
    return entry.specialCasing;
}
/** Returns the East Asian width value stored for the code point. */
public byte getEastAsianWidth(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.eastAsianWidth;
}
/** Returns the line break value stored for the code point. */
public byte getLineBreak(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.lineBreak;
}
/** Returns the script value stored for the code point. */
public byte getScript(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.script;
}
/** Returns the age value stored for the code point. */
public byte getAge(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.age;
}
/** Returns the joining type stored for the code point. */
public byte getJoiningType(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.joiningType;
}
/** Returns the joining group stored for the code point. */
public byte getJoiningGroup(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.joiningGroup;
}
/** Returns the binary properties stored for the code point, as a bit set in an int. */
public int getBinaryProperties(int codePoint) {
    final UData entry = get(codePoint, false);
    return entry.binaryProperties;
}
}
/**
 * Tests a single binary property: true when bit number {@code bit} is
 * set in the code point's binary-properties value.
 */
public boolean getBinaryProperty(int codePoint, int bit) {
    final int properties = get(codePoint, false).binaryProperties;
    final int mask = 1 << bit;
    return (properties & mask) != 0;
}
}
// ENUM Mask Utilities
// Each method below returns an int with a single bit set, at the position
// given by the corresponding enumerated property value of the code point.
// NOTE(review): Java masks an int shift distance to five bits, so a value
// of 32 or more would wrap onto a lower bit - confirm every enumeration
// used here stays below 32.
public int getCategoryMask(int codePoint) {
    final int index = get(codePoint, false).generalCategory;
    return 1 << index;
}
public int getBidiClassMask(int codePoint) {
    final int index = get(codePoint, false).bidiClass;
    return 1 << index;
}
public int getNumericTypeMask(int codePoint) {
    final int index = get(codePoint, false).numericType;
    return 1 << index;
}
public int getDecompositionTypeMask(int codePoint) {
    final int index = get(codePoint, false).decompositionType;
    return 1 << index;
}
public int getEastAsianWidthMask(int codePoint) {
    final int index = get(codePoint, false).eastAsianWidth;
    return 1 << index;
}
public int getLineBreakMask(int codePoint) {
    final int index = get(codePoint, false).lineBreak;
    return 1 << index;
}
public int getScriptMask(int codePoint) {
    final int index = get(codePoint, false).script;
    return 1 << index;
}
public int getAgeMask(int codePoint) {
    final int index = get(codePoint, false).age;
    return 1 << index;
}
public int getJoiningTypeMask(int codePoint) {
    final int index = get(codePoint, false).joiningType;
    return 1 << index;
}
public int getJoiningGroupMask(int codePoint) {
    final int index = get(codePoint, false).joiningGroup;
    return 1 << index;
}
// VERSIONS WITH NAMES
/** Returns the general category name for the code point, looked up in UCD_Names.GC. */
public String getCategoryID(int codePoint) {
    final byte category = getCategory(codePoint);
    return getCategoryID_fromIndex(category);
}
/** Returns the entry of UCD_Names.GC at the given index. */
public static String getCategoryID_fromIndex(byte prop) {
    final String[] names = UCD_Names.GC;
    return names[prop];
}
/** Returns the bidi class name for the code point, looked up in UCD_Names.BC. */
public String getBidiClassID(int codePoint) {
    final byte bidiClass = getBidiClass(codePoint);
    return getBidiClassID_fromIndex(bidiClass);
}
/** Returns the entry of UCD_Names.BC at the given index. */
public static String getBidiClassID_fromIndex(byte prop) {
    final String[] names = UCD_Names.BC;
    return names[prop];
}
/** Returns the combining class of the code point as a string. */
public String getCombiningClassID(int codePoint) {
    final short combiningClass = getCombiningClass(codePoint);
    return getCombiningClassID_fromIndex(combiningClass);
}
/**
 * Returns the decimal string form of a combining class value.
 */
public static String getCombiningClassID_fromIndex(short cc) {
    return Short.toString(cc);
}
// Name lookups: each instance method reads the property value for the code
// point and passes it to the matching static *_fromIndex method, which
// indexes the corresponding UCD_Names table.
public String getDecompositionTypeID(int codePoint) {
    final byte value = getDecompositionType(codePoint);
    return getDecompositionTypeID_fromIndex(value);
}
public static String getDecompositionTypeID_fromIndex(byte prop) {
    final String[] names = UCD_Names.DT;
    return names[prop];
}
public String getNumericTypeID(int codePoint) {
    final byte value = getNumericType(codePoint);
    return getNumericTypeID_fromIndex(value);
}
public static String getNumericTypeID_fromIndex(byte prop) {
    final String[] names = UCD_Names.NT;
    return names[prop];
}
public String getEastAsianWidthID(int codePoint) {
    final byte value = getEastAsianWidth(codePoint);
    return getEastAsianWidthID_fromIndex(value);
}
public static String getEastAsianWidthID_fromIndex(byte prop) {
    final String[] names = UCD_Names.EA;
    return names[prop];
}
public String getLineBreakID(int codePoint) {
    final byte value = getLineBreak(codePoint);
    return getLineBreakID_fromIndex(value);
}
public static String getLineBreakID_fromIndex(byte prop) {
    final String[] names = UCD_Names.LB;
    return names[prop];
}
public String getJoiningTypeID(int codePoint) {
    final byte value = getJoiningType(codePoint);
    return getJoiningTypeID_fromIndex(value);
}
public static String getJoiningTypeID_fromIndex(byte prop) {
    final String[] names = UCD_Names.JOINING_TYPE;
    return names[prop];
}
public String getJoiningGroupID(int codePoint) {
    final byte value = getJoiningGroup(codePoint);
    return getJoiningGroupID_fromIndex(value);
}
public static String getJoiningGroupID_fromIndex(byte prop) {
    final String[] names = UCD_Names.JOINING_GROUP;
    return names[prop];
}
public String getScriptID(int codePoint) {
    final byte value = getScript(codePoint);
    return getScriptID_fromIndex(value);
}
public static String getScriptID_fromIndex(byte prop) {
    final String[] names = UCD_Names.SCRIPT;
    return names[prop];
}
public String getAgeID(int codePoint) {
    final byte value = getAge(codePoint);
    return getAgeID_fromIndex(value);
}
public static String getAgeID_fromIndex(byte prop) {
    final String[] names = UCD_Names.AGE;
    return names[prop];
}
/** Returns "Y" when the given bit is set in the code point's binary properties, otherwise "N". */
public String getBinaryPropertiesID(int codePoint, byte bit) {
    final int properties = getBinaryProperties(codePoint);
    if ((properties & (1 << bit)) != 0) {
        return "Y";
    }
    return "N";
}
/** Returns the entry of UCD_Names.BP for the given bit number. */
public static String getBinaryPropertiesID_fromIndex(byte bit) {
    final String[] names = UCD_Names.BP;
    return names[bit];
}
public static int mapToRepresentative(int ch, boolean old) {
if (ch <= 0xFFFD) {
//if (ch <= 0x2800) return ch;
@ -624,7 +637,7 @@ public final class UCD implements UCD_Types {
}
return ch;
}
public boolean isIdentifierStart(int cp, boolean extended) {
if (extended) {
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
@ -635,7 +648,7 @@ public final class UCD implements UCD_Types {
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
return false;
}
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
if (isIdentifierStart(cp, extended)) return true;
if (extended) {
@ -646,7 +659,7 @@ public final class UCD implements UCD_Types {
if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
return false;
}
public boolean isIdentifier(String s, boolean extended) {
if (s.length() == 0) return false; // at least one!
int cp;
@ -661,34 +674,34 @@ public final class UCD implements UCD_Types {
return true;
}
/*
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
allowed in <identifier_extend>.
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
0E33 THAI CHARACTER SARA AM
0EB3 LAO VOWEL SIGN AM
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
0E33 THAI CHARACTER SARA AM
0EB3 LAO VOWEL SIGN AM
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
Irregularly decomposing characters. U+037A GREEK YPOGEGRAMMENI and certain Arabic presentation
forms have irregular compatibility decompositions, and need to be excluded from both <identifier_start>
and <identifier_extend>. It is recommended that all Arabic presentation forms be excluded from identifiers
in any event, although only a few of them are required to be excluded for normalization
to guarantee identifier closure.
to guarantee identifier closure.
*/
// *******************
// *******************
// PRIVATES
// *******************
// *******************
// cache of singletons
private static Map versionCache = new HashMap();
private static final int LIMIT_CODE_POINT = 0x110000;
private static final UData[] ALL_NULLS = new UData[1024];
// main data
private UData[][] data = new UData[LIMIT_CODE_POINT>>10][];
// extras
private BitSet combiningClassSet = new BitSet(256);
private String version;
@ -699,19 +712,19 @@ to guarantee identifier closure.
private byte minor = -1;
private byte update = -1;
private int size = -1;
// cache last UData
private int lastCode = Integer.MIN_VALUE;
private UData lastResult = UData.UNASSIGNED;
private boolean lastCodeFixed = false;
// Private constructor: every slot of the top-level table starts out
// pointing at the shared ALL_NULLS page.
private UCD() {
    java.util.Arrays.fill(data, ALL_NULLS);
}
private void add(UData uData) {
int high = uData.codePoint>>10;
if (data[high] == ALL_NULLS) {
@ -720,7 +733,7 @@ to guarantee identifier closure.
}
data[high][uData.codePoint & 0x3FF] = uData;
}
public boolean hasComputableName(int codePoint) {
if (codePoint >= 0xF900 && codePoint <= 0xFA2D) return true;
int rangeStart = mapToRepresentative(codePoint, major < 2);
@ -744,11 +757,11 @@ to guarantee identifier closure.
return true;
}
}
private UData getRaw(int codePoint) {
    // The bits above the low ten select the page; the low ten bits select the slot in it.
    final UData[] page = data[codePoint >> 10];
    final int slot = codePoint & 0x3FF;
    return page[slot];
}
// access data for codepoint
UData get(int codePoint, boolean fixStrings) {
//if (codePoint == lastCode && fixStrings <= lastCodeFixed) return lastResult;
@ -756,7 +769,7 @@ to guarantee identifier closure.
// we play some funny tricks for performance
// if cp is not represented, it is either in an elided block or missing.
// elided blocks are either CONTINUE or FFFF
byte cat;
if (!ucdData.isRepresented(cp)) {
int rep = UCD.mapToRepresentative(cp);
@ -768,9 +781,9 @@ to guarantee identifier closure.
cat = ucdData.getCategory(cp);
}
*/
UData result = null;
// do range stuff
String constructedName = null;
int rangeStart = mapToRepresentative(codePoint, major < 2);
@ -820,7 +833,7 @@ to guarantee identifier closure.
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
return result;
}
result.codePoint = codePoint;
if (fixStrings) {
result.name = constructedName;
@ -835,10 +848,10 @@ to guarantee identifier closure.
}
return result;
}
// Hangul constants
static final int
static final int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
NCount = VCount * TCount, // 588
@ -859,9 +872,9 @@ to guarantee identifier closure.
// if (true) return "?";
return UCD_Names.JAMO_L_TABLE[LIndex] + UCD_Names.JAMO_V_TABLE[VIndex] + UCD_Names.JAMO_T_TABLE[TIndex];
}
private static final char[] pair = new char[2];
static String getHangulDecompositionPair(int ch) {
int SIndex = ch - SBase;
if (0 > SIndex || SIndex >= SCount) {
@ -877,7 +890,7 @@ to guarantee identifier closure.
}
return String.valueOf(pair);
}
static int composeHangul(int char1, int char2) {
if (LBase <= char1 && char1 < LLimit && VBase <= char2 && char2 < VLimit) {
return (SBase + ((char1 - LBase) * VCount + (char2 - VBase)) * TCount);
@ -888,11 +901,11 @@ to guarantee identifier closure.
}
return 0xFFFF; // no composition
}
/**
 * Returns true when {@code cp} lies in the half-open range
 * [VBase, VLimit) or in the half-open range [TBase, TLimit).
 */
static boolean isTrailingJamo(int cp) {
    if (VBase <= cp && cp < VLimit) {
        return true;
    }
    return TBase <= cp && cp < TLimit;
}
private void fillFromFile(String version) {
DataInputStream dataIn = null;
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
@ -909,58 +922,58 @@ to guarantee identifier closure.
update = dataIn.readByte();
String foundVersion = major + "." + minor + "." + update;
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
new Object[]{version, new Byte(format), foundVersion});
}
date = dataIn.readLong();
size = uDataFileCount = dataIn.readInt();
boolean didJoiningHack = false;
// records
for (int i = 0; i < uDataFileCount; ++i) {
UData uData = new UData();
uData.readBytes(dataIn);
if (uData.codePoint == 0x2801) {
System.out.println("SPOT-CHECK: " + uData);
}
//T = Mc + (Cf - ZWNJ - ZWJ)
int cp = uData.codePoint;
byte old = uData.joiningType;
byte cat = uData.generalCategory;
//if (cp == 0x200D) {
// uData.joiningType = JT_C;
//} else
//} else
if (cp != 0x200D && cp != 0x200C && (cat == Mn || cat == Cf)) {
uData.joiningType = JT_T;
}
if (!didJoiningHack && uData.joiningType != old) {
System.out.println("HACK: Setting "
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
System.out.println("HACK: Setting "
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
+ ": " + Utility.hex(cp) + " " + uData.name);
didJoiningHack = true;
}
combiningClassSet.set(uData.combiningClass & 0xFF);
add(uData);
}
/*
if (update == -1) {
throw new ChainException("Data File truncated for ",
throw new ChainException("Data File truncated for ",
new Object[]{version}, e);
}
if (size != fileSize) {
throw new ChainException("Counts do not match: file {0}, records {1}",
throw new ChainException("Counts do not match: file {0}, records {1}",
new Object[]{new Integer(fileSize), new Integer(size)});
}
*/
// everything is ok!
this.version = version;
this.file = fileName;
//+ " " + new File(fileName).lastModified();
//+ " " + new File(fileName).lastModified();
} catch (IOException e) {
throw new ChainException("Can't read data file for {0}", new Object[]{version}, e);
} finally {
@ -971,4 +984,4 @@ to guarantee identifier closure.
}
}
}
}
}

View file

@ -1,16 +1,29 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
final class UCD_Names implements UCD_Types {
static final String[] UNIFIED_PROPERTIES = {
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
"Combining Class (listing UnicodeData.txt, field 3: see UnicodeData.html)",
"Bidi Class (listing UnicodeData.txt, field 4: see UnicodeData.html)",
"Decomposition Type (from UnicodeData.txt, field 5: see UnicodeData.html)",
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
"East Asian Width (listing EastAsianWidth.txt, field 1)",
"Line Break (listing LineBreak.txt, field 1)",
"Joining Type (listing ArabicShaping.txt, field 1).\r\n"
@ -21,13 +34,13 @@ final class UCD_Names implements UCD_Types {
"Script",
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
};
static final String[] SHORT_UNIFIED_PROPERTIES = {
"GeneralCategory",
"CombiningClass",
"BidiClass",
"DecompositionType",
"NumericType",
"NumericType",
"EastAsianWidth",
"LineBreak",
"JoiningType",
@ -36,13 +49,13 @@ final class UCD_Names implements UCD_Types {
"Script",
"Age"
};
static final String[] ABB_UNIFIED_PROPERTIES = {
"gc",
"cc",
"bc",
"dt",
"nt",
"nt",
"ea",
"lb",
"jt",
@ -51,11 +64,11 @@ final class UCD_Names implements UCD_Types {
"sc",
"Ag"
};
static final String[] BP = {
"BidiMirrored",
"CompositionExclusion",
"CompositionExclusion",
"White_Space",
"NonBreak",
"Bidi_Control",
@ -84,10 +97,10 @@ final class UCD_Names implements UCD_Types {
"Reserved_Cf_Code_Point",
"Deprecated",
};
static final String[] SHORT_BP = {
"BidiM",
"CExc",
"CExc",
"WhSp",
"NBrk",
"BdCon",
@ -116,11 +129,11 @@ final class UCD_Names implements UCD_Types {
"RCf",
"Dep",
};
/*
static final String[] BP_OLD = {
"BidiMirrored",
"CompositionExclusion",
"CompositionExclusion",
"White_space",
"Non_break",
"Bidi_Control",
@ -146,7 +159,7 @@ final class UCD_Names implements UCD_Types {
"UnifiedIdeograph"
};
*/
static final String[] DeletedProperties = {
"Private_Use",
"Composite",
@ -158,17 +171,17 @@ final class UCD_Names implements UCD_Types {
"Private_Use_High_Surrogate",
"Unassigned_Code_Point"
};
static final String[] YN_TABLE = {"N", "Y"};
static String[] EA = {
"N", "A", "H", "W", "F", "Na"
};
};
static String[] SHORT_EA = {
"Neutral", "Ambiguous", "Halfwidth", "Wide", "Fullwidth", "Narrow"
};
};
static final String[] LB = {
"XX", "OP", "CL", "QU", "GL", "NS", "EX", "SY",
"IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY",
@ -177,11 +190,11 @@ final class UCD_Names implements UCD_Types {
};
static final String[] LONG_LB = {
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
"Glue", "Nonstarter", "Exclamation", "BreakSymbols",
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
};
@ -230,7 +243,7 @@ final class UCD_Names implements UCD_Types {
"DESERET",
"INHERITED",
};
public static final String[] ABB_SCRIPT = {
"Zyyy", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
"Latn", // LATIN
@ -275,17 +288,17 @@ final class UCD_Names implements UCD_Types {
"Dsrt",
"Qaai",
};
static final String[] AGE = {
"UNSPECIFIED",
"1.1",
"2.0", "2.1",
"3.0", "3.1"
};
static final String[] GC = {
"Cn", // = Other, Not Assigned 0
@ -328,7 +341,7 @@ final class UCD_Names implements UCD_Types {
"Pi", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
"Pf" // = Punctuation, Final quote 30 (may behave like Ps or Pe dependingon usage)
};
static final String[] LONG_GC = {
"Unassigned", // = Other, Not Assigned 0
@ -372,7 +385,7 @@ final class UCD_Names implements UCD_Types {
"FinalPunctuation" // = Punctuation, Final quote 30 (may behave like Ps or Pe dependingon usage)
};
static String[] BC = {
"L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
@ -388,7 +401,7 @@ final class UCD_Names implements UCD_Types {
"ON", // Other Neutrals ; All other characters: punctuation, symbols
"<unused>", "BN", "NSM", "AL", "LRO", "RLO", "LRE", "RLE", "PDF"
};
static String[] LONG_BC = {
"LeftToRight", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
"RightToLeft", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
@ -401,13 +414,13 @@ final class UCD_Names implements UCD_Types {
"SegmentSeparator", // Segment Separator
"WhiteSpace", // Whitespace
"OtherNeutral", // Other Neutrals ; All other characters: punctuation, symbols
"<unused>",
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
"LeftToRightOverride",
"RightToLeftOverride", "LeftToRightEmbedding",
"<unused>",
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
"LeftToRightOverride",
"RightToLeftOverride", "LeftToRightEmbedding",
"RightToLeftEmbedding", "PopDirectionalFormat"
};
// Upper-case labels for the case categories.
// NOTE(review): the order presumably mirrors the LOWER / TITLE / UPPER / UNCASED
// codes used elsewhere in the package -- confirm before reordering.
private static String[] CASE_TABLE = {
"LOWER", "TITLE", "UPPER", "UNCASED"
};
@ -432,7 +445,7 @@ final class UCD_Names implements UCD_Types {
"square", // A CJK squared font variant.
"fraction", // A vulgar fraction form.
};
static String[] SHORT_DT = {
"", // NONE
"ca", // CANONICAL
@ -453,7 +466,7 @@ final class UCD_Names implements UCD_Types {
"sq", // A CJK squared font variant.
"fr", // A vulgar fraction form.
};
static private String[] MIRRORED_TABLE = {
"N",
"Y"
@ -465,14 +478,14 @@ final class UCD_Names implements UCD_Types {
"digit",
"decimal",
};
// Two-letter abbreviations for the numeric types; the empty string is the entry
// for "no numeric type".
// NOTE(review): presumably indexed by the NUMERIC_NONE / NUMERIC / DIGIT / DECIMAL
// codes from UCD_Types (0..3) -- confirm against the long-name table.
static String[] SHORT_NT = {
"",
"nu",
"di",
"de",
};
static {
if (LIMIT_CATEGORY != GC.length) {
System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: category");
@ -511,9 +524,9 @@ final class UCD_Names implements UCD_Types {
System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: age");
}
}
// Result of looking up the string "ON" in the BC (bidi class short-name) table,
// computed once when this class is initialised.
// NOTE(review): presumably the array index of "ON" in BC -- confirm against
// Utility.lookup. The field is not final, so callers are able to overwrite it.
public static byte ON = Utility.lookup("ON", BC);
public static String[] JOINING_TYPE = {
"C",
"D",
@ -584,7 +597,7 @@ final class UCD_Names implements UCD_Types {
"YUDH_HE",
"ZAIN",
};
public static String[] OLD_JOINING_GROUP = {
"<no shaping>",
"AIN",
@ -637,9 +650,9 @@ final class UCD_Names implements UCD_Types {
"YUDH HE",
"ZAIN",
};
static String[] JAMO_L_TABLE = {
// Value; Short Name; Unicode Name
"G", // U+1100; G; HANGUL CHOSEONG KIYEOK
@ -662,7 +675,7 @@ final class UCD_Names implements UCD_Types {
"P", // U+1111; P; HANGUL CHOSEONG PHIEUPH
"H" // U+1112; H; HANGUL CHOSEONG HIEUH
};
static String[] JAMO_V_TABLE = {
// Value; Short Name; Unicode Name
"A", // U+1161; A; HANGUL JUNGSEONG A
@ -687,7 +700,7 @@ final class UCD_Names implements UCD_Types {
"YI", // U+1174; YI; HANGUL JUNGSEONG YI
"I", // U+1175; I; HANGUL JUNGSEONG I
};
static String[] JAMO_T_TABLE = {
// Value; Short Name; Unicode Name
"", // filler, for LV syllable
@ -721,7 +734,7 @@ final class UCD_Names implements UCD_Types {
};
/*
static {
UNASSIGNED_INFO.code = '\uFFFF';

View file

@ -1,10 +1,23 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
public interface UCD_Types {
// Root directory for the data files; a hard-coded Windows path that already
// ends with a path separator.
public static final String DATA_DIR = "C:\\DATA\\";
// Sub-directories of DATA_DIR.
// NOTE(review): because DATA_DIR ends with a separator and these suffixes start
// with one, the resulting paths contain a doubled separator -- confirm this is
// intended (or at least harmless) before reusing the constants elsewhere.
public static final String BIN_DIR = DATA_DIR + "\\BIN\\";
public static final String GEN_DIR = DATA_DIR + "\\GEN\\";
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
/*
@ -24,7 +37,7 @@ public interface UCD_Types {
13 Lower case equivalent mapping. Similar to 12. This field is informative.
14 Title case equivalent mapping. Similar to 12. This field is informative.
*/
// Binary ENUM Grouping
public static final int
CATEGORY = 0,
@ -41,9 +54,9 @@ public interface UCD_Types {
AGE = 0xB00,
NEXT_ENUM = 0x100,
LIMIT_ENUM = AGE + 0x100;
public static final int LIMIT_COMBINING_CLASS = 256;
// getCategory
public static final byte
UNASSIGNED = 0,
@ -78,7 +91,7 @@ public interface UCD_Types {
INITIAL_PUNCTUATION = 29,
FINAL_PUNCTUATION = 30,
LIMIT_CATEGORY = FINAL_PUNCTUATION+1,
// Unicode abbreviations
Lu = UPPERCASE_LETTER,
Ll = LOWERCASE_LETTER,
@ -110,7 +123,7 @@ public interface UCD_Types {
Sc = CURRENCY_SYMBOL,
Sk = MODIFIER_SYMBOL,
So = OTHER_SYMBOL;
static final int
LETTER_MASK = (1<<Lu) | (1<<Ll) | (1<<Lt) | (1<<Lm) | (1 << Lo),
MARK_MASK = (1<<Mn) | (1<<Me) | (1<<Mc),
@ -120,12 +133,12 @@ public interface UCD_Types {
PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
UNASSIGNED_MASK = (1<<Cn);
// Binary Properties
public static final byte
BidiMirrored = 0,
CompositionExclusion = 1,
CompositionExclusion = 1,
White_space = 2,
Non_break = 3,
Bidi_Control = 4,
@ -154,11 +167,11 @@ public interface UCD_Types {
Reserved_Cf_Code_Point = 27,
Deprecated = 28,
LIMIT_BINARY_PROPERTIES = 29;
/*
static final int
BidiMirroredMask = 1<<BidiMirrored,
CompositionExclusionMask = 1<<CompositionExclusion,
CompositionExclusionMask = 1<<CompositionExclusion,
AlphabeticMask = 1<<Alphabetic,
Bidi_ControlMask = 1<<Bidi_Control,
DashMask = 1<<Dash,
@ -181,15 +194,15 @@ public interface UCD_Types {
// line break
public static final byte
LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
// east asian width
public static final byte
EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
LIMIT_EAST_ASIAN_WIDTH = 6;
// bidi class
static final byte
BIDI_L = 0, // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
@ -214,7 +227,7 @@ public interface UCD_Types {
BIDI_RLE = 18,
BIDI_PDF = 19,
LIMIT_BIDI_CLASS = 20;
// decompositionType
static final byte NONE = 0,
CANONICAL = 1,
@ -239,7 +252,7 @@ public interface UCD_Types {
// mirrored type
static final byte NO = 0, YES = 1, MIRRORED_LIMIT = 2;
// for QuickCheck
static final byte QNO = 0, QMAYBE = 1, QYES = 2;
@ -251,7 +264,7 @@ public interface UCD_Types {
static final byte UNNORMALIZED = 0, C = 1, KC = 2, D = 3, KD = 4, FORM_LIMIT = 5;
// numericType
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
LIMIT_NUMERIC_TYPE = 4;
public static final byte // SCRIPT CODE
@ -263,7 +276,7 @@ public interface UCD_Types {
HEBREW_SCRIPT = 5,
ARABIC_SCRIPT = 6,
SYRIAC_SCRIPT = 7,
THAANA_SCRIPT = 8,
THAANA_SCRIPT = 8,
DEVANAGARI_SCRIPT = 9,
BENGALI_SCRIPT = 10,
GURMUKHI_SCRIPT = 11,
@ -298,8 +311,8 @@ public interface UCD_Types {
DESERET_SCRIPT = 40,
INHERITED_SCRIPT = 41,
LIMIT_SCRIPT = 42;
static final int
static final int
UNKNOWN = 0,
AGE10 = 1,
AGE20 = 2,
@ -307,9 +320,9 @@ public interface UCD_Types {
AGE30 = 4,
AGE31 = 5,
LIMIT_AGE = 6;
public static byte
JT_C = 0,
JT_D = 1,

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
@ -16,24 +29,24 @@ class UData implements UCD_Types {
String fullCaseFolding;
String specialCasing = "";
String bidiMirror;
int codePoint = -1;
float numericValue = Float.NaN;
int binaryProperties; // bidiMirroring, compositionExclusions, PropList
byte generalCategory = Cn;
byte combiningClass = 0;
byte bidiClass = BIDI_ON;
byte decompositionType = NONE;
byte numericType = NUMERIC_NONE;
byte eastAsianWidth = EAN;
byte lineBreak = LBXX;
byte joiningType = JT_U;
byte joiningGroup = NO_SHAPING;
byte script = COMMON_SCRIPT;
byte age = 0;
static final UData UNASSIGNED = new UData();
//static final UData NONCHARACTER = new UData();
static {
@ -43,7 +56,7 @@ class UData implements UCD_Types {
= UNASSIGNED.simpleLowercase
= UNASSIGNED.simpleTitlecase = "";
UNASSIGNED.fleshOut();
/*NONCHARACTER.name = "<noncharacter>";
NONCHARACTER.decompositionMapping = NONCHARACTER.bidiMirror
= NONCHARACTER.simpleUppercase
@ -54,14 +67,14 @@ class UData implements UCD_Types {
NONCHARACTER.fleshOut();
*/
}
/**
 * Creates a record for the given code point. Every other field keeps the
 * default from its declaration until it is filled in by the caller.
 *
 * @param codePoint the code point this record describes
 */
public UData (int codePoint) {
    this.codePoint = codePoint;
}
/**
 * Creates a record with no code point assigned yet: {@code codePoint} stays at
 * its declared initial value of -1 and all other fields keep their defaults.
 */
public UData () {
    // nothing to do -- the field initializers supply every default
}
public boolean equals(Object that) {
UData other = (UData) that;
if (!name.equals(other.name)) return false;
@ -92,87 +105,87 @@ class UData implements UCD_Types {
if (age != other.age) return false;
return true;
}
/**
 * Replaces every {@code null} mapping with its implied value so that all
 * mapping fields can be read without null checks.
 *
 * Fallback chain: the simple mappings, the decomposition and the bidi mirror
 * fall back to the character itself; each full mapping falls back to the
 * matching simple mapping; the simple case folding falls back to the simple
 * lowercase and the full case folding to the full lowercase.
 */
public void fleshOut() {
    final String self = UTF32.valueOf32(codePoint);

    // Mappings whose default is the character itself.
    if (decompositionMapping == null) {
        decompositionMapping = self;
    }
    if (bidiMirror == null) {
        bidiMirror = self;
    }

    // Uppercase: simple first, then full derived from it.
    if (simpleUppercase == null) {
        simpleUppercase = self;
    }
    if (fullUppercase == null) {
        fullUppercase = simpleUppercase;
    }

    // Titlecase: simple first, then full derived from it.
    if (simpleTitlecase == null) {
        simpleTitlecase = self;
    }
    if (fullTitlecase == null) {
        fullTitlecase = simpleTitlecase;
    }

    // Lowercase and the foldings that hang off it. Order matters here: each
    // field may be used as the default of the ones that follow.
    if (simpleLowercase == null) {
        simpleLowercase = self;
    }
    if (simpleCaseFolding == null) {
        simpleCaseFolding = simpleLowercase;
    }
    if (fullLowercase == null) {
        fullLowercase = simpleLowercase;
    }
    if (fullCaseFolding == null) {
        fullCaseFolding = fullLowercase;
    }
}
/**
 * Inverse of {@link #fleshOut()}: sets every mapping that merely repeats its
 * implied default back to {@code null}.
 *
 * The record is fleshed out first so that every field is non-null while it is
 * being compared. Dependent fields are always tested before the field they
 * default to is cleared.
 */
public void compact() {
    fleshOut();
    final String self = UTF32.valueOf32(codePoint);

    // Titlecase: full before simple.
    if (fullTitlecase.equals(simpleTitlecase)) {
        fullTitlecase = null;
    }
    if (simpleTitlecase.equals(self)) {
        simpleTitlecase = null;
    }

    // Uppercase: full before simple.
    if (fullUppercase.equals(simpleUppercase)) {
        fullUppercase = null;
    }
    if (simpleUppercase.equals(self)) {
        simpleUppercase = null;
    }

    // Lowercase family, outermost dependants first.
    if (fullCaseFolding.equals(fullLowercase)) {
        fullCaseFolding = null;
    }
    if (fullLowercase.equals(simpleLowercase)) {
        fullLowercase = null;
    }
    if (simpleCaseFolding.equals(simpleLowercase)) {
        simpleCaseFolding = null;
    }
    if (simpleLowercase.equals(self)) {
        simpleLowercase = null;
    }

    // Mappings whose default is the character itself.
    if (decompositionMapping.equals(self)) {
        decompositionMapping = null;
    }
    if (bidiMirror.equals(self)) {
        bidiMirror = null;
    }
}
/**
 * Replaces the whole binary-property word of this record.
 *
 * @param binaryProperties the new value, stored as-is
 */
public void setBinaryProperties(int binaryProperties) {
    this.binaryProperties = binaryProperties;
}
/**
 * Tests whether this record's general category is one of the letter
 * categories collected in {@code UCD_Types.LETTER_MASK}.
 *
 * @return true if the bit for {@code generalCategory} is set in the mask
 */
public boolean isLetter() {
    final int categoryBit = 1 << generalCategory;
    return (categoryBit & UCD_Types.LETTER_MASK) != 0;
}
/**
 * Writes an optional string in the format understood by readString: one
 * marker byte (0 = null, 1 = present), followed by the string in
 * {@link DataOutputStream#writeUTF} form when it is present.
 *
 * @param os the stream to write to
 * @param s  the string to write; may be null
 * @throws IOException if the underlying stream fails
 */
public static void writeString(DataOutputStream os, String s) throws IOException {
    if (s == null) {
        // Exactly one marker byte. The reader consumes a single byte before
        // deciding whether a string follows, so writing more than one here
        // would shift every later field in the stream.
        os.writeByte(0);
        return;
    }
    os.writeByte(1);
    os.writeUTF(s);
}
static final byte[] byteBuffer = new byte[256];
/**
 * Reads an optional string: one unsigned marker byte, then -- unless the
 * marker is zero -- a string in {@link DataInputStream#readUTF} form.
 *
 * @param is the stream to read from
 * @return the decoded string, or null when the marker byte is zero
 * @throws IOException if the underlying stream fails or ends early
 */
public static String readString(DataInputStream is) throws IOException {
    final boolean present = is.readUnsignedByte() != 0;
    return present ? is.readUTF() : null;
}
static final byte ABBREVIATED = 0, FULL = 1;
/**
 * Returns the textual form of this record in the FULL style; shorthand for
 * {@code toString(FULL)}.
 */
public String toString() {
    final byte style = FULL;
    return toString(style);
}
public String toString(byte style) {
boolean full = style == FULL;
StringBuffer result = new StringBuffer();
String s = UTF32.valueOf32(codePoint);
result.append("<e c='").append(Utility.quoteXML(codePoint)).append('\'');
result.append(" hx='").append(Utility.hex(codePoint)).append('\'');
if (full || script != COMMON_SCRIPT) result.append(" sn='").append(UCD_Names.SCRIPT[script]).append('\'');
result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");
int lastPos = result.length();
if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
@ -180,21 +193,21 @@ class UData implements UCD_Types {
if (full || numericType != NUMERIC_NONE) result.append(" nt='").append(UCD_Names.NT[numericType]).append('\'');
if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
if (full || bidiClass != BIDI_L) result.append(" bc='").append(UCD_Names.BC[bidiClass]).append('\'');
if (full || !bidiMirror.equals(s)) result.append(" bmg='").append(Utility.quoteXML(bidiMirror)).append('\'');
if (lastPos != result.length()) {
result.append("\r\n");
lastPos = result.length();
}
//String bp = "";
int bprops = binaryProperties;
for (int i = 0; i < LIMIT_BINARY_PROPERTIES; ++i) {
@ -204,26 +217,26 @@ class UData implements UCD_Types {
result.append("\r\n");
lastPos = result.length();
}
if (full || !fullLowercase.equals(s)) result.append(" lc='").append(Utility.quoteXML(fullLowercase)).append('\'');
if (full || !fullUppercase.equals(simpleUppercase)) result.append(" uc='").append(Utility.quoteXML(fullUppercase)).append('\'');
if (full || !fullTitlecase.equals(fullUppercase)) result.append(" tc='").append(Utility.quoteXML(fullTitlecase)).append('\'');
if (full || !fullCaseFolding.equals(fullLowercase)) result.append(" cf='").append(Utility.quoteXML(fullCaseFolding)).append('\'');
if (full || !simpleLowercase.equals(simpleLowercase)) result.append(" slc='").append(Utility.quoteXML(simpleLowercase)).append('\'');
if (full || !simpleUppercase.equals(simpleUppercase)) result.append(" suc='").append(Utility.quoteXML(simpleUppercase)).append('\'');
if (full || !simpleTitlecase.equals(simpleUppercase)) result.append(" stc='").append(Utility.quoteXML(simpleTitlecase)).append('\'');
if (full || !simpleCaseFolding.equals(simpleLowercase)) result.append(" sfc='").append(Utility.quoteXML(simpleCaseFolding)).append('\'');
if (full || !specialCasing.equals("")) result.append(" fsc='").append(Utility.quoteXML(specialCasing)).append('\'');
result.append("/>");
return result.toString();
}
public void writeBytes(DataOutputStream os) throws IOException {
compact();
os.writeInt(codePoint);
writeString(os, name);
writeString(os, decompositionMapping);
writeString(os, simpleUppercase);
@ -236,10 +249,10 @@ class UData implements UCD_Types {
writeString(os, fullCaseFolding);
writeString(os, specialCasing);
writeString(os, bidiMirror);
os.writeFloat(numericValue);
os.writeInt(binaryProperties);
os.writeByte(generalCategory);
os.writeByte(combiningClass);
os.writeByte(bidiClass);
@ -252,10 +265,10 @@ class UData implements UCD_Types {
os.writeByte(script);
os.writeByte(age);
}
public void readBytes(DataInputStream is) throws IOException {
codePoint = is.readInt();
name = readString(is);
decompositionMapping = readString(is);
simpleUppercase = readString(is);
@ -268,10 +281,10 @@ class UData implements UCD_Types {
fullCaseFolding = readString(is);
specialCasing = readString(is);
bidiMirror = readString(is);
numericValue = is.readFloat();
binaryProperties = is.readInt();
generalCategory = is.readByte();
combiningClass = is.readByte();
bidiClass = is.readByte();
@ -284,7 +297,7 @@ class UData implements UCD_Types {
script = is.readByte();
age = is.readByte();
fleshOut();
// HACK
/*
int bp = binaryProperties;
@ -300,7 +313,7 @@ class UData implements UCD_Types {
binaryProperties = bp;
}
*/
/*
if (generalCategory == Sm) {
if ((binaryProperties & Math_PropertyMask) != 0) {

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
@ -11,25 +24,25 @@ import java.io.*;
import com.ibm.text.utility.*;
public class VerifyUCD implements UCD_Types {
public static final String IDN_DIR = DATA_DIR + "\\IDN\\";
static String ucdVersion = "";
public static void main (String[] args) throws Exception {
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.charAt(0) == '#') return; // skip rest of line
Utility.fixDot();
System.out.println("Argument: " + args[i]);
if (arg.equalsIgnoreCase("all")) {
//checkCase();
checkCanonicalProperties();
CheckCaseFold();
checkAgainstUInfo();
} else if (arg.equalsIgnoreCase("build")) {
ConvertUCD.main(new String[]{ucdVersion});
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
@ -46,9 +59,9 @@ public class VerifyUCD implements UCD_Types {
else if (arg.equalsIgnoreCase("IdentifierTest")) IdentifierTest();
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
else {
System.out.println("Unknown option -- must be one of the following (case-insensitive)");
System.out.println("generateXML, checkCase, checkCanonicalProperties, CheckCaseFold,");
@ -58,7 +71,7 @@ public class VerifyUCD implements UCD_Types {
}
}
}
/*
System.out.println(ucd.toString(0x0387));
System.out.println(ucd.toString(0x00B7));
@ -70,7 +83,7 @@ public class VerifyUCD implements UCD_Types {
System.out.println(ucd.toString(0x0131));
System.out.println(ucd.toString(0x0345));
*/
static void checkAgainstOtherVersion(String otherVersion) {
ucd = UCD.make(ucdVersion);
UCD ucd2 = UCD.make(otherVersion);
@ -85,15 +98,15 @@ public class VerifyUCD implements UCD_Types {
}
}
}
static void generateXML() throws IOException {
ucd = UCD.make(ucdVersion);
String filename = "UCD.xml";
PrintWriter log = Utility.openPrintWriter(filename);
//log.println('\uFEFF');
log.println("<ucd>");
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!ucd.isRepresented(cp)) continue;
@ -102,13 +115,13 @@ public class VerifyUCD implements UCD_Types {
}
log.println(ucd.toString(cp));
}
log.println("</ucd>");
log.close();
}
static final byte MIXED = (byte)(UNCASED + 1);
public static void checkCase() throws IOException {
Utility.fixDot();
System.out.println("checkCase");
@ -117,7 +130,7 @@ public class VerifyUCD implements UCD_Types {
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
String fileName = "CaseDifferences.txt";
PrintWriter log = Utility.openPrintWriter(fileName);
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
@ -128,13 +141,13 @@ public class VerifyUCD implements UCD_Types {
String xu = ucd.getCase(x, FULL, UPPER);
String xl = ucd.getCase(x, FULL, LOWER);
String xt = ucd.getCase(x, FULL, TITLE);
byte caseCat = MIXED;
if (xu.equals(xl)) caseCat = UNCASED;
else if (x.equals(xl)) caseCat = LOWER;
else if (x.equals(xu)) caseCat = UPPER;
else if (x.equals(xt)) caseCat = TITLE;
byte cat = ucd.getCategory(cp);
boolean otherLower = ucd.getBinaryProperty(cp, Other_Lowercase);
boolean otherUpper = ucd.getBinaryProperty(cp, Other_Uppercase);
@ -142,15 +155,15 @@ public class VerifyUCD implements UCD_Types {
: (cat == Ll || otherLower) ? LOWER
: (cat == Lt) ? TITLE
: UNCASED;
if (caseCat != oldCaseCat) {
log.println(UTF32.valueOf32(cp)
+ "\t" + names[caseCat]
+ "\t" + names[caseCat]
+ "\t" + names[oldCaseCat]
+ "\t" + ucd.getCategoryID_fromIndex(cat)
+ "\t" + lowerNames[otherLower ? 1 : 0]
+ "\t" + upperNames[otherUpper ? 1 : 0]
+ "\t" + ucd.getCodeAndName(cp)
+ "\t" + ucd.getCategoryID_fromIndex(cat)
+ "\t" + lowerNames[otherLower ? 1 : 0]
+ "\t" + upperNames[otherUpper ? 1 : 0]
+ "\t" + ucd.getCodeAndName(cp)
+ "\t" + ucd.getCodeAndName(x)
+ "\t" + ucd.getCodeAndName(xu)
+ "\t" + ucd.getCodeAndName(xl)
@ -158,10 +171,10 @@ public class VerifyUCD implements UCD_Types {
);
}
}
log.close();
}
public static void checkCase2() throws IOException {
Utility.fixDot();
System.out.println("checkCase");
@ -170,52 +183,52 @@ public class VerifyUCD implements UCD_Types {
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
String fileName = "CaseNormalizationDifferences.txt";
PrintWriter log = Utility.openPrintWriter(fileName);
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
log.println("u, l, t - upper, lower, title");
log.println("c, d - nfc, nfd");
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
if (cp == '\u3371') {
System.out.println("debug");
}
String x = UTF32.valueOf32(cp);
String ux = ucd.getCase(x, FULL, UPPER);
String lx = ucd.getCase(x, FULL, LOWER);
String tx = ucd.getCase(x, FULL, TITLE);
String dux = nfd.normalize(ux);
String dlx = nfd.normalize(lx);
String dtx = nfd.normalize(tx);
String cux = nfc.normalize(ux);
String clx = nfc.normalize(lx);
String ctx = nfc.normalize(tx);
String dx = nfd.normalize(cp);
String cx = nfc.normalize(cp);
String udx = ucd.getCase(dx, FULL, UPPER);
String ldx = ucd.getCase(dx, FULL, LOWER);
String tdx = ucd.getCase(dx, FULL, TITLE);
String ucx = ucd.getCase(cx, FULL, UPPER);
String lcx = ucd.getCase(cx, FULL, LOWER);
String tcx = ucd.getCase(cx, FULL, TITLE);
String dudx = nfd.normalize(udx);
String dldx = nfd.normalize(ldx);
String dtdx = nfd.normalize(tdx);
String cucx = nfc.normalize(ucx);
String clcx = nfc.normalize(lcx);
String ctcx = nfc.normalize(tcx);
if (!dux.equals(udx)
|| !dlx.equals(ldx)
|| !dtx.equals(tdx)
@ -236,7 +249,7 @@ public class VerifyUCD implements UCD_Types {
if (!tx.equals(ux)) log.println("\tt(cp):\t" + ucd.getCodeAndName(tx));
if (!x.equals(dx)) log.println("\td(cp):\t" + ucd.getCodeAndName(dx));
if (!x.equals(cx)) log.println("\tc(cp):\t" + ucd.getCodeAndName(cx));
if (!dux.equals(udx)) {
log.println();
log.println("\td(u(cp)):\t" + ucd.getCodeAndName(dux));
@ -252,7 +265,7 @@ public class VerifyUCD implements UCD_Types {
log.println("\td(t(cp)):\t" + ucd.getCodeAndName(dtx));
log.println("\tt(d(cp)):\t" + ucd.getCodeAndName(tdx));
}
if (!cux.equals(ucx)) {
log.println();
log.println("\tc(u(cp)):\t" + ucd.getCodeAndName(cux));
@ -268,9 +281,9 @@ public class VerifyUCD implements UCD_Types {
log.println("\tc(t(cp)):\t" + ucd.getCodeAndName(ctx));
log.println("\tt(c(cp)):\t" + ucd.getCodeAndName(tcx));
}
// ...........
if (!udx.equals(dudx)) {
log.println();
log.println("\tu(d(cp)):\t" + ucd.getCodeAndName(udx));
@ -286,7 +299,7 @@ public class VerifyUCD implements UCD_Types {
log.println("\tt(d(cp)):\t" + ucd.getCodeAndName(tdx));
log.println("\td(t(d(cp))):\t" + ucd.getCodeAndName(dtdx));
}
if (!ucx.equals(cucx)) {
log.println();
log.println("\tu(c(cp)):\t" + ucd.getCodeAndName(ucx));
@ -304,14 +317,14 @@ public class VerifyUCD implements UCD_Types {
}
}
}
log.close();
}
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
static final String lowerNames[] = {"", "Other_Lower"};
static final String upperNames[] = {"", "Other_Upper"};
public static void CheckCaseFold() {
ucd = UCD.make(ucdVersion);
System.out.println("Checking Case Fold");
@ -320,10 +333,10 @@ public class VerifyUCD implements UCD_Types {
if (!ucd.isAssigned(cp) || ucd.isPUA(cp)) continue;
String fullTest = ucd.getCase(ucd.getCase(cp, FULL, UPPER), FULL, LOWER);
String simpleTest = ucd.getCase(ucd.getCase(cp, SIMPLE, UPPER), SIMPLE, LOWER);
String full = ucd.getCase(cp, FULL, FOLD);
String simple = ucd.getCase(cp, SIMPLE, FOLD);
boolean failed = false;
if (!full.equals(fullTest)) {
Utility.fixDot();
@ -342,29 +355,29 @@ public class VerifyUCD implements UCD_Types {
if (failed) System.out.println();
}
}
public static void VerifyIDN() throws IOException {
System.out.println("VerifyIDN");
ucd = UCD.make(ucdVersion);
initNormalizers();
System.out.println();
System.out.println("Checking Map");
System.out.println();
BitSet mappedOut = new BitSet();
int errorCount = verifyUTFMap(mappedOut);
BitSet unassigned = getIDNList("IDN-Unassigned.txt");
BitSet prohibited = getIDNList("IDN-Prohibited.txt");
BitSet guessSet = guessIDN();
System.out.println();
System.out.println("Checking Prohibited and Unassigned");
System.out.println();
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
if (mappedOut.get(cp)) continue;
boolean ucdUnassigned = !ucd.isAllocated(cp);
boolean idnUnassigned = unassigned.get(cp);
boolean guess = guessSet.get(cp);
@ -377,12 +390,12 @@ public class VerifyUCD implements UCD_Types {
showError("Not UCD Unassigned but IDN Unassigned: ", cp);
++errorCount;
}
if (idnProhibited && unassigned.get(cp)) {
showError("Both IDN Unassigned AND IDN Prohibited: ", cp);
++errorCount;
}
if (guess && !idnProhibited) {
showError("UCD ?prohibited? but not IDN Prohibited: ", cp);
++errorCount;
@ -390,72 +403,72 @@ public class VerifyUCD implements UCD_Types {
showError("Not UCD ?prohibited? but IDN Prohibited: ", cp);
++errorCount;
}
}
System.out.println();
System.out.println("Total Errors: " + errorCount);
}
/**
 * Prints one diagnostic line to standard output: the description, followed by
 * the code-and-name of the code point and its category ID in parentheses.
 *
 * @param description text printed in front of the character details
 * @param cp          the code point the message is about
 */
static void showError(String description, int cp) {
    StringBuffer line = new StringBuffer();
    line.append(description);
    line.append(ucd.getCodeAndName(cp));
    line.append(" (");
    line.append(ucd.getCategoryID(cp));
    line.append(")");
    System.out.println(line.toString());
}
/**
 * Builds, from UCD properties alone, the set of code points that the rules
 * listed in the comments below (5.1 - 5.11) would prohibit.
 *
 * The scan covers the whole code space, 0 through 0x10FFFF inclusive. The
 * upper bound must be inclusive: the last code point is itself a candidate,
 * and VerifyIDN queries the returned set for every cp up to and including
 * 0x10FFFF.
 *
 * @return a BitSet with one bit set per code point guessed to be prohibited
 */
public static BitSet guessIDN() {
    BitSet result = new BitSet();
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        int cat = ucd.getCategory(cp);

        // 5.1 Currently-prohibited ASCII characters
        // (everything in ASCII except '-', letters and decimal digits)
        if (cp < 0x80 && cp != '-' && !(cat == Lu || cat == Ll || cat == Nd)) result.set(cp);

        // 5.2 Space characters
        if (cat == Zs) result.set(cp);

        // 5.3 Control characters
        if (cat == Cc || cat == Zp || cat == Zl) result.set(cp);

        // exclude those reserved for Cf
        /*if (0x2060 <= cp && cp <= 0x206F) result.set(cp);
        if (0xFFF0 <= cp && cp <= 0xFFFC) result.set(cp);
        if (0xE0000 <= cp && cp <= 0xE0FFF) result.set(cp);
        */

        // 5.4 Private use and replacement characters
        if (cat == Co) result.set(cp);
        if (cp == 0xFFFD) result.set(cp);

        // 5.5 Non-character code points
        if (ucd.getBinaryProperty(cp, Noncharacter_Code_Point)) result.set(cp);

        // 5.6 Surrogate codes
        if (cat == Cs) result.set(cp);

        // 5.7 Inappropriate for plain text
        if (cat == Cf) result.set(cp);
        if (cp == 0xFFFC) result.set(cp);

        // 5.8 Inappropriate for domain names
        if (isIDS(cp)) result.set(cp);

        // 5.9 Change display properties
        // Cf, checked above

        // 5.10 Inappropriate characters from common input mechanisms
        if (cp == 0x3002) result.set(cp);

        // 5.11 Tagging characters
        // Cf, checked above
    }
    return result;
}
/**
 * Tests whether a code point lies in the range 0x2FF0 - 0x2FFB (both ends
 * included).
 *
 * @param cp the code point to test
 * @return true if cp is inside the range
 */
static boolean isIDS(int cp) {
    if (cp < 0x2FF0) {
        return false;
    }
    return cp <= 0x2FFB;
}
/*
5.1 Currently-prohibited ASCII characters
@ -610,8 +623,8 @@ The following characters are used for tagging text and are invisible.
E0001; LANGUAGE TAG
E0020-E007F; [TAGGING CHARACTERS]
*/
public static int verifyUTFMap(BitSet mappedOut) throws IOException {
int errorCount = 0;
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + "IDN-Mapping.txt"),32*1024);
@ -627,9 +640,9 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println("//" + lineNumber + ": '" + line + "'");
}
if (line.length() == 0) continue;
int count = Utility.split(line,';',parts);
if (count != 3) throw new ChainException("Incorrect # of fields in IDN folding", null);
@ -650,12 +663,12 @@ E0020-E007F; [TAGGING CHARACTERS]
idnFold.put(key, value);
idnWhy.put(key, reason);
}
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!ucd.isAssigned(cp) || ucd.isPUA(cp)) continue;
if (mappedOut.get(cp)) continue;
String key = UTF32.valueOf32(cp);
String value = (String)idnFold.get(key);
if (value == null) value = key;
@ -667,7 +680,7 @@ E0020-E007F; [TAGGING CHARACTERS]
if (c.equals(value)) continue;
Utility.fixDot();
System.out.println("Mismatch: " + ucd.getCodeAndName(cp));
System.out.println(" UCD Case Fold: <" + ucd.getCodeAndName(ucdFold) + ">");
System.out.println(" IDN Map [" + reason + "]: <" + ucd.getCodeAndName(value) + ">");
@ -679,7 +692,7 @@ E0020-E007F; [TAGGING CHARACTERS]
}
return errorCount;
}
static BitSet getIDNList(String file) throws IOException {
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + file),32*1024);
BitSet result = new BitSet();
@ -693,14 +706,14 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println("//" + lineNumber + ": '" + line + "'");
}
if (line.length() == 0) continue;
int count = Utility.split(line,'-',parts);
if (count > 2) throw new ChainException("Incorrect # of fields in IDN list", null);
int start = Utility.codePointFromHex(parts[0]);
int end = count == 1 ? start : Utility.codePointFromHex(parts[1]);
for (int i = start; i <= end; ++i) {
result.set(i);
}
@ -710,12 +723,12 @@ E0020-E007F; [TAGGING CHARACTERS]
}
return result;
}
private static void IdentifierTest() {
String x = normalize(UTF32.valueOf32(0x10300), 4) ;
getCategoryID(x);
/*
Changes Category: U+10300 OLD ITALIC LETTER A
nfx_cp: U+D800 <surrogate-D800>
@ -724,7 +737,7 @@ E0020-E007F; [TAGGING CHARACTERS]
isIdentifierStart(cp, true): true
cat(cp): Lo
*/
for (int j = 0; j < 5; ++j) {
System.out.println();
System.out.println("Testing Identifier Closure for " + NAMES[j]);
@ -734,11 +747,11 @@ E0020-E007F; [TAGGING CHARACTERS]
if (!ucd.isAssigned(cp)) continue;
if (ucd.isPUA(cp)) continue;
if (!normalizationDiffers(cp, j)) continue;
if (cp == 0xFDFB || cp == 0x0140) {
System.out.println("debug point");
}
boolean norm;
boolean plain;
@ -750,15 +763,15 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println("*Not Identifier: " + ucd.getCodeAndName(cp));
System.out.println(" nfx_x_cp: " + ucd.getCodeAndName(nfx_x_cp));
System.out.println(" isIdentifier(nfx_x_cp, true): " + norm);
System.out.println(" cat(nfx_x_cp): " + getCategoryID(nfx_x_cp));
System.out.println(" isIdentifier(x_cp, true): " + plain);
System.out.println(" cat(x_cp): " + getCategoryID(x_cp));
continue;
}
String nfx_cp = normalize(UTF32.valueOf32(cp), j);
plain = ucd.isIdentifierStart(cp, true);
norm = ucd.isIdentifier(nfx_cp, true);
@ -766,10 +779,10 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println(" Changes Category: " + ucd.getCodeAndName(cp));
System.out.println(" nfx_cp: " + ucd.getCodeAndName(nfx_cp));
System.out.println(" isIdentifier(nfx_cp, true): " + norm);
System.out.println(" cat(nfx_cp): " + getCategoryID(nfx_cp));
System.out.println(" isIdentifierStart(cp, true): " + plain);
System.out.println(" cat(cp): " + ucd.getCategoryID(cp));
System.out.println();
@ -778,7 +791,7 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
}
static String getCategoryID(String s) {
if (UTF32.length32(s) == 1) return ucd.getCategoryID(UTF32.char32At(s, 0));
StringBuffer result = new StringBuffer();
@ -790,30 +803,30 @@ E0020-E007F; [TAGGING CHARACTERS]
}
return result.toString();
}
/**
 * Applies transformation number {@code j} to a string: indices below 4 select
 * the corresponding entry of the {@code nf} normalizer array, any other index
 * selects full case folding through the UCD.
 *
 * @param s the string to transform
 * @param j the transformation index
 * @return the transformed string
 */
static String normalize(String s, int j) {
    if (j >= 4) {
        return ucd.getCase(s, FULL, FOLD);
    }
    return nf[j].normalize(s);
}
/**
 * Reports whether transformation number {@code j} should be examined for {@code cp}.
 * Indices below 4 ask the normalizer in {@code nf[j]}; every other index
 * unconditionally answers {@code true}.
 *
 * @param cp the code point in question
 * @param j index of the transformation
 * @return {@code true} when {@code j >= 4}, otherwise the answer of {@code nf[j]}
 */
static boolean normalizationDiffers(int cp, int j) {
    // Short-circuit keeps nf[j] from being touched for indices outside the normalizer table.
    return j >= 4 || nf[j].normalizationDiffers(cp);
}
// Normalizer table indexed by form; initNormalizers() fills slots 0..3 with NFD, NFC, NFKD, NFKC.
private static Normalizer[] nf = new Normalizer[4];
// The same four instances under individual names; also assigned by initNormalizers().
private static Normalizer nfd, nfc, nfkd, nfkc;
/**
 * Creates one normalizer per form (NFD, NFC, NFKD, NFKC, in that order) and
 * stores each both in its slot of {@code nf} and in its individually named field.
 */
static void initNormalizers() {
    final Normalizer decomposed = new Normalizer(Normalizer.NFD);
    nf[0] = decomposed;
    nfd = decomposed;

    final Normalizer composed = new Normalizer(Normalizer.NFC);
    nf[1] = composed;
    nfc = composed;

    final Normalizer compatDecomposed = new Normalizer(Normalizer.NFKD);
    nf[2] = compatDecomposed;
    nfkd = compatDecomposed;

    final Normalizer compatComposed = new Normalizer(Normalizer.NFKC);
    nf[3] = compatComposed;
    nfkc = compatComposed;
}
// Unicode Character Database instance queried by the checks in this class;
// assigned by the individual entry points (e.g. via UCD.make(ucdVersion)).
private static UCD ucd;
// Display labels for the transformation indices: 0..3 match the slots of nf,
// index 4 is the case-folding path taken by normalize() when j >= 4.
private static final String[] NAMES = {"NFD", "NFC", "NFKD", "NFKC", "Fold"};
private static void NFTest() {
initNormalizers();
for (int j = 0; j < 4; ++j) {
@ -834,10 +847,10 @@ E0020-E007F; [TAGGING CHARACTERS]
+ ", call: " + call + " " + ucd.getCodeAndName(i));
}
}
}
}
public static void checkScripts() {
ucd = UCD.make(ucdVersion);
for (int i = 0; i < 0x10FFFF; ++i) {
@ -847,21 +860,21 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
}
public static void checkAgainstUInfo() {
/*
ucd = UCD.make(ucdVersion);
UData x = new UData();
x.fleshOut();
System.out.println(ucd.toString(0x1E0A));
UInfo.init();
System.out.println("Cross-checking against old implementation");
System.out.println("Version: " + ucd.getVersion() + ", " + new Date(ucd.getDate()));
for (int i = 0; i <= 0xFFFF; ++i) {
Utility.dot(i);
if ((i & 0x0FFF) == 0) System.out.println("#" + Utility.hex(i));
try {
check(i, ucd.getName(i), UInfo.getName((char)i), "Name");
@ -872,12 +885,12 @@ E0020-E007F; [TAGGING CHARACTERS]
check(i, ucd.getDecompositionType(i), UInfo.getDecompositionType((char)i), UCD_Names.DT, "DecompositionType");
check(i, ucd.getNumericValue(i), UInfo.getNumeric((char)i), "NumericValue");
check(i, ucd.getNumericType(i), UInfo.getNumericType((char)i), UCD_Names.NT, "NumericType");
check(i, ucd.getCase(i, SIMPLE, LOWER), UInfo.getLowercase((char)i), "SimpleLowercase");
check(i, ucd.getCase(i, SIMPLE, UPPER), UInfo.getUppercase((char)i), "SimpleUppercase");
check(i, ucd.getCase(i, SIMPLE, TITLE), UInfo.getTitlecase((char)i), "SimpleTitlecase");
//check(i, ucd.getSimpleCaseFolding(i), UInfo.getSimpleCaseFolding((char)i));
if (ucd.getSpecialCase(i).length() == 0) { // NORMAL
check(i, ucd.getCase(i, FULL, LOWER), UInfo.toLowercase((char)i, ""), "FullLowercase");
check(i, ucd.getCase(i, FULL, UPPER), UInfo.toUppercase((char)i, ""), "FullUppercase");
@ -888,18 +901,18 @@ E0020-E007F; [TAGGING CHARACTERS]
check(i, ucd.getCase(i, SIMPLE, TITLE), UInfo.toTitlecase((char)i, ""), "FullTitlecase");
}
// check(i, ucd.getFullCaseFolding(i), UInfo.getFullCaseFolding((char)i));
check(i, ucd.getSpecialCase(i).toUpperCase(), UInfo.getCaseCondition((char)i).toUpperCase(), "SpecialCase");
check(i, ucd.getLineBreak(i), UInfo.getLineBreakType((char)i), UCD_Names.LB, "LineBreak");
check(i, ucd.getEastAsianWidth(i), UInfo.getEastAsianWidthType((char)i), UCD_Names.EA, "EastAsian");
int props = ucd.getBinaryProperties(i);
check(i, (props>>BidiMirrored) & 1, UInfo.getMirrored((char)i), UCD_Names.YN_TABLE, "BidiMirroring");
check(i, (props>>CompositionExclusion) & 1, UInfo.isCompositionExcluded((char)i)?1:0, UCD_Names.YN_TABLE, "Comp-Exclusion");
} catch (Exception e) {
Utility.fixDot();
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
e.printStackTrace();
}
@ -907,38 +920,38 @@ E0020-E007F; [TAGGING CHARACTERS]
*/
}
/**
 * Boolean convenience overload: converts each flag to 1 (true) or 0 (false)
 * and forwards to the int/names overload of {@code check}.
 *
 * @param cp the code point being checked
 * @param x first value
 * @param y second value
 * @param names label table passed through to the int overload
 * @param type name of the property being compared
 */
public static void check(int cp, boolean x, boolean y, String[] names, String type) {
    final int xAsInt = x ? 1 : 0;
    final int yAsInt = y ? 1 : 0;
    check(cp, xAsInt, yAsInt, names, type);
}
public static void check(int cp, int x, int y, String[] names, String type) {
if (x == y) return;
showLast(cp);
Utility.fixDot();
System.out.println(" " + type + ": "
+ Utility.getName(x, names) + " (" + x + ") " + " != "
System.out.println(" " + type + ": "
+ Utility.getName(x, names) + " (" + x + ") " + " != "
+ Utility.getName(y, names) + " (" + y + ") ") ;
}
/**
 * Compares two plain integer property values and reports a mismatch.
 * When they differ, calls {@code showLast(cp)} and {@code Utility.fixDot()},
 * then prints both values; equal values produce no output.
 *
 * @param cp the code point being checked
 * @param x first value
 * @param y second value
 * @param type name of the property being compared, used as the line prefix
 */
public static void check(int cp, int x, int y, String type) {
    if (x != y) {
        showLast(cp);
        Utility.fixDot();
        final String report = " " + type + ": " + x + " != " + y;
        System.out.println(report);
    }
}
/**
 * Compares two float property values and reports a mismatch.
 * A mismatch is reported only when one value is strictly greater than the
 * other. Every comparison involving NaN is false, so a NaN on either side is
 * never reported.
 *
 * @param cp the code point being checked
 * @param x first value
 * @param y second value
 * @param type name of the property being compared, used as the line prefix
 */
public static void check(int cp, float x, float y, String type) {
    // Deliberately not "x != y": that would flag NaN, which must be let through.
    final boolean strictlyOrdered = x > y || x < y;
    if (strictlyOrdered) {
        showLast(cp);
        Utility.fixDot();
        final String report = " " + type + ": " + x + " != " + y;
        System.out.println(report);
    }
}
public static void check(int cp, String x, String y, String type) {
if (x != null && x.equals(y)) return;
if (x != null && y != null
&& x.length() > 0 && y.length() > 0
if (x != null && y != null
&& x.length() > 0 && y.length() > 0
&& x.charAt(0) == '<' && y.charAt(0) == '<') {
if (x.startsWith("<unassigned") && y.equals("<reserved>")) return;
if (y.equals("<control>")) return;
@ -949,11 +962,11 @@ E0020-E007F; [TAGGING CHARACTERS]
Utility.fixDot();
System.out.println(" " + type + ": " + Utility.quoteJavaString(x) + " != " + Utility.quoteJavaString(y));
}
static int lastShowed = -1;
static boolean showCanonicalDecomposition = false;
static void showLast(int cp) {
if (lastShowed != cp) {
Utility.fixDot();
@ -967,14 +980,14 @@ E0020-E007F; [TAGGING CHARACTERS]
lastShowed = cp;
}
}
public static void test1() {
ucd = UCD.make(ucdVersion);
for (int i = 0x19; i < 0x10FFFF; ++i) {
System.out.println(Utility.hex(i) + " " + Utility.quoteJavaString(ucd.getName(i)));
System.out.print(" "
+ ", gc=" + ucd.getCategoryID(i)
+ ", bc=" + ucd.getBidiClassID(i)
@ -989,7 +1002,7 @@ E0020-E007F; [TAGGING CHARACTERS]
if (ucd.getBinaryProperty(i,j)) System.out.print(", " + UCD_Names.BP[j]);
}
System.out.println();
System.out.println(" "
+ ", dm=" + Utility.quoteJavaString(ucd.getDecompositionMapping(i))
+ ", slc=" + Utility.quoteJavaString(ucd.getCase(i, SIMPLE, LOWER))
@ -1000,15 +1013,15 @@ E0020-E007F; [TAGGING CHARACTERS]
+ ", fuc=" + Utility.quoteJavaString(ucd.getCase(i, FULL, UPPER))
+ ", sc=" + Utility.quoteJavaString(ucd.getSpecialCase(i))
);
if (i > 0x180) i = 3 * i / 2;
}
}
static void checkCanonicalProperties() {
ucd = UCD.make(ucdVersion);
System.out.println(ucd.toString(0x1E0A));
System.out.println("Cross-checking canonical equivalence");
System.out.println("Version: " + ucd.getVersion() + ", " + new Date(ucd.getDate()));
showCanonicalDecomposition = true;
@ -1020,7 +1033,7 @@ E0020-E007F; [TAGGING CHARACTERS]
}
byte type = ucd.getDecompositionType(i);
if (type != CANONICAL) continue;
String s = ucd.getDecompositionMapping(i);
int slen = UTF32.length32(s);
int j = UTF32.char32At(s, 0);
@ -1031,16 +1044,16 @@ E0020-E007F; [TAGGING CHARACTERS]
check(i, ucd.getBidiClass(i), ucd.getBidiClass(j), UCD_Names.BC, "BidiClass");
check(i, ucd.getNumericValue(i), ucd.getNumericValue(j), "NumericValue");
check(i, ucd.getNumericType(i), ucd.getNumericType(j), UCD_Names.NT, "NumericType");
if (false) {
for (byte k = LOWER; k <= FOLD; ++k) {
check(i, ucd.getCase(i, SIMPLE, k), ucd.getCase(j, SIMPLE, k), "Simple("+k+")");
check(i, ucd.getCase(i, FULL, k), ucd.getCase(j, FULL, k), "Full("+k+")");
}
}
if (slen == 1) check(i, ucd.getSpecialCase(i), ucd.getSpecialCase(j), "SpecialCase");
for (byte k = 0; k < LIMIT_BINARY_PROPERTIES; ++k) {
if (k == Hex_Digit) continue;
if (k == Radical) continue;
@ -1052,12 +1065,12 @@ E0020-E007F; [TAGGING CHARACTERS]
//check(i, ucd.getLineBreak(i), ucd.getLineBreak(j), UCD_Names.LB, "LineBreak");
//check(i, ucd.getEastAsianWidth(i), ucd.getEastAsianWidth(j), UCD_Names.EA, "EastAsian");
}
} catch (Exception e) {
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
e.printStackTrace();
}
}
}
}

View file

@ -1,3 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
@ -7,14 +20,14 @@ import com.ibm.text.utility.*;
public class WriteJavaScriptInfo {
/* TODO: fix enumeration of compositions
static public void writeJavascriptInfo() throws IOException {
System.err.println("Writing Javascript data");
UCD ucd = UCD.make();
Normalizer normKD = new Normalizer(Normalizer.NFKD);
Normalizer normD = new Normalizer(Normalizer.NFD);
PrintWriter log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
int count = 0;
int datasize = 0;
int max = 0;
@ -22,7 +35,7 @@ public class WriteJavaScriptInfo {
log.println("var KD = new Object(); // NFKD compatibility decomposition mappings");
log.println("// NOTE: Hangul is done in code!");
CompactShortArray csa = new CompactShortArray((short)0);
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
@ -50,7 +63,7 @@ public class WriteJavaScriptInfo {
log.println("var D = new Object(); // NFD canonical decomposition mappings");
log.println("// NOTE: Hangul is done in code!");
csa = new CompactShortArray((short)0);
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
@ -64,7 +77,7 @@ public class WriteJavaScriptInfo {
}
}
csa.compact();
log.println("// " + count + " NFD mappings total");
log.println("// " + datasize + " total characters of results");
log.println("// " + max + " string length, maximum");
@ -75,13 +88,13 @@ public class WriteJavaScriptInfo {
datasize = 0;
log.println("var CC = new Object(); // canonical class mappings");
CompactByteArray cba = new CompactByteArray();
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
int canClass = normKD.getCanonicalClass(c);
if (canClass != 0) {
++count;
log.println("\t CC[0x" + Utility.hex(c) + "]=" + canClass + ";");
}
}
@ -89,7 +102,7 @@ public class WriteJavaScriptInfo {
log.println("// " + count + " canonical class mappings total");
log.println("// " + cba.storage() + " trie length");
log.println();
count = 0;
datasize = 0;
log.println("var C = new Object(); // composition mappings");
@ -105,11 +118,11 @@ public class WriteJavaScriptInfo {
}
log.println("// " + count + " composition mappings total");
log.println();
log.close();
System.err.println("Done writing Javascript data");
}
*/
}