mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
Forgot copyrights
X-SVN-Rev: 5643
This commit is contained in:
parent
4c3e3b8dff
commit
7260c9a6a4
20 changed files with 1310 additions and 1050 deletions
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -10,35 +23,35 @@ import com.ibm.text.utility.*;
|
|||
|
||||
|
||||
public class BuildNames implements UCD_Types {
|
||||
|
||||
|
||||
static final boolean DEBUG = true;
|
||||
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
|
||||
collectWords();
|
||||
}
|
||||
|
||||
|
||||
static Set words = new TreeSet(new LengthFirstComparator());
|
||||
static Set lines = new TreeSet(new LengthFirstComparator());
|
||||
static int[] letters = new int[128];
|
||||
|
||||
|
||||
static void stash(String word) {
|
||||
words.add(word);
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
letters[word.charAt(i)]++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static String transform(String line) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean changed = false;
|
||||
for (int i = 0; i < line.length(); ++i) {
|
||||
char c = line.charAt(i);
|
||||
|
||||
|
||||
if (c == '-' || c == '<' || c == '>') {
|
||||
if (result.length() > 0 && result.charAt(result.length()-1) != ' ') result.append(' ');
|
||||
result.append(c);
|
||||
|
@ -46,7 +59,7 @@ public class BuildNames implements UCD_Types {
|
|||
changed = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if ('a' <= c && c <= 'z') {
|
||||
result.append((char)(c - 'a' + 'A'));
|
||||
changed = true;
|
||||
|
@ -56,15 +69,15 @@ public class BuildNames implements UCD_Types {
|
|||
result.append('*').append((char)(c - '0' + 'A'));
|
||||
changed = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
result.append(c);
|
||||
}
|
||||
if (!changed) return line;
|
||||
return result.toString().trim();
|
||||
}
|
||||
|
||||
|
||||
static void collectWords() throws IOException {
|
||||
|
||||
|
||||
System.out.println("Gathering data");
|
||||
//Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
|
@ -74,23 +87,23 @@ public class BuildNames implements UCD_Types {
|
|||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (ucd.hasComputableName(i)) continue;
|
||||
String name = transform(ucd.getName(i));
|
||||
|
||||
|
||||
|
||||
|
||||
sum += name.length();
|
||||
used++;
|
||||
|
||||
|
||||
// replace numbers & letters
|
||||
|
||||
|
||||
int len = Utility.split(name, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
stash(parts[j]);
|
||||
}
|
||||
|
||||
|
||||
lines.add(name);
|
||||
}
|
||||
System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
|
||||
System.out.println("Strings: " + sum + ", " + (lastLink*4));
|
||||
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Words");
|
||||
System.out.println();
|
||||
|
@ -104,7 +117,7 @@ public class BuildNames implements UCD_Types {
|
|||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Lines");
|
||||
System.out.println();
|
||||
|
@ -122,18 +135,18 @@ public class BuildNames implements UCD_Types {
|
|||
if (false || !goesRound) System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")"
|
||||
+ (!goesRound ? ": NO RT: '" + round + "'" : ""));
|
||||
}
|
||||
|
||||
|
||||
/*System.out.println("Printing Compact Forms");
|
||||
for (int i = 0; i < CompactName.lastToken; ++i) {
|
||||
String s = CompactName.stringFromToken(i);
|
||||
System.out.println(i + ": '" + s + "'");
|
||||
}*/
|
||||
|
||||
|
||||
System.out.println("Strings: " + sum
|
||||
+ ", " + (CompactName.spacedMinimum*4)
|
||||
+ ", " + (CompactName.lastToken*4)
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
/*
|
||||
Set stuff = new TreeSet();
|
||||
|
@ -142,7 +155,7 @@ public class BuildNames implements UCD_Types {
|
|||
stuff.add(new Integer((letters[i] << 8) + i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
it = stuff.iterator();
|
||||
while (it.hasNext()) {
|
||||
int in = ((Integer) it.next()).intValue();
|
||||
|
@ -153,13 +166,13 @@ public class BuildNames implements UCD_Types {
|
|||
System.out.println("\tNo Round Trip: '" + rname + "'");
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
static Map stringToInt = new HashMap();
|
||||
static Map intToString = new HashMap();
|
||||
|
||||
|
||||
static final int[] remap = new int['Z'+1];
|
||||
static final int maxToken;
|
||||
|
||||
|
||||
static {
|
||||
int counter = 1;
|
||||
remap[' '] = counter++;
|
||||
|
@ -174,7 +187,7 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
maxToken = counter;
|
||||
}
|
||||
|
||||
|
||||
static final String[] unmap = new String[maxToken];
|
||||
static {
|
||||
unmap[0] = "";
|
||||
|
@ -183,16 +196,16 @@ public class BuildNames implements UCD_Types {
|
|||
if (x != 0) unmap[x] = String.valueOf((char)i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int[] links = new int[40000];
|
||||
static final int linkStart = 0;
|
||||
static int lastLink = 0;
|
||||
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;
|
||||
|
||||
|
||||
static boolean isLiteral(int i) {
|
||||
return (i & 0x7FFF) > LITERAL_BOUND;
|
||||
}
|
||||
|
||||
|
||||
static String lookup(int i) {
|
||||
String result;
|
||||
boolean trailingSpace = false;
|
||||
|
@ -216,7 +229,7 @@ public class BuildNames implements UCD_Types {
|
|||
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int getInt(String s) {
|
||||
if (s.length() < 3) {
|
||||
if (s.length() == 0) return 0;
|
||||
|
@ -228,14 +241,14 @@ public class BuildNames implements UCD_Types {
|
|||
if (in == null) return -1;
|
||||
return ((Integer)in).intValue();
|
||||
}
|
||||
|
||||
|
||||
static int putString(String s, int lead, int trail) {
|
||||
Object in = stringToInt.get(s);
|
||||
if (in != null) throw new IllegalArgumentException();
|
||||
int value = (lead << 16) + (trail & 0xFFFF);
|
||||
int result = lastLink;
|
||||
links[lastLink++] = value;
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail);
|
||||
String roundTrip = lookup(result);
|
||||
|
@ -246,7 +259,7 @@ public class BuildNames implements UCD_Types {
|
|||
stringToInt.put(s, new Integer(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
|
||||
static int addString(String s) {
|
||||
int result = getInt(s);
|
||||
|
@ -259,9 +272,9 @@ public class BuildNames implements UCD_Types {
|
|||
int lastSpace = -1;
|
||||
int spaceBits;
|
||||
int endOfFirst;
|
||||
|
||||
|
||||
// invariant. We break after a space if there is one.
|
||||
|
||||
|
||||
for (int i = 1; i < limit; ++i) {
|
||||
char c = s.charAt(i-1);
|
||||
spaceBits = 0;
|
||||
|
@ -271,7 +284,7 @@ public class BuildNames implements UCD_Types {
|
|||
endOfFirst--;
|
||||
spaceBits = 0x8000;
|
||||
}
|
||||
|
||||
|
||||
String firstPart = s.substring(0, endOfFirst);
|
||||
String lastPart = s.substring(i);
|
||||
if (firstPart.equals("<START OF ")) {
|
||||
|
@ -292,7 +305,7 @@ public class BuildNames implements UCD_Types {
|
|||
if (i > bestSpaceLen && c == ' ') {
|
||||
bestSpaceLen = i;
|
||||
bestSpace_i = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
int end_i = s.length() - i;
|
||||
if (!isLiteral(trail)) {
|
||||
|
@ -310,9 +323,9 @@ public class BuildNames implements UCD_Types {
|
|||
bestLen = bestSpaceLen;
|
||||
best_i = bestSpace_i;
|
||||
}
|
||||
|
||||
|
||||
spaceBits = 0;
|
||||
|
||||
|
||||
if (bestLen > 0) { // if one matches, recurse -- and return pair
|
||||
endOfFirst = best_i;
|
||||
if (lastSpace > 0) {
|
||||
|
@ -335,8 +348,8 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
// otherwise, we failed to find anything. Then break before the last word, if there is one
|
||||
// otherwise break in the middle (but at even value)
|
||||
|
||||
|
||||
|
||||
|
||||
if (lastSpace >= 0) {
|
||||
best_i = lastSpace;
|
||||
endOfFirst = lastSpace - 1;
|
||||
|
@ -350,7 +363,7 @@ public class BuildNames implements UCD_Types {
|
|||
+ "' # '" + lastPart + "' FALLBACK");
|
||||
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static int addCompression(String s) {
|
||||
Object in = stringToInt.get(s);
|
||||
|
@ -363,7 +376,7 @@ public class BuildNames implements UCD_Types {
|
|||
if (c == ' ' || c == '-') {
|
||||
Object pos1 = stringToInt.get(s.substring(0,i+1));
|
||||
//Object pos23 = stringToInt.get(s..substring(i));
|
||||
|
||||
|
||||
|
||||
if (pos2 >= 0 && pos3 >= 0) {
|
||||
fullToCompressed.put(value, new Integer(index + reserved));
|
||||
|
@ -381,11 +394,11 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void gatherData() throws IOException {
|
||||
System.out.println("Gathering data");
|
||||
Counter counter = new Counter();
|
||||
|
@ -415,29 +428,29 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Sorting data");
|
||||
Map m = counter.extract();
|
||||
|
||||
|
||||
System.out.println("Printing data");
|
||||
|
||||
|
||||
PrintWriter log = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
|
||||
32*1024));
|
||||
|
||||
|
||||
log.println("total: " + total);
|
||||
|
||||
|
||||
Iterator it = m.keySet().iterator();
|
||||
|
||||
|
||||
String mondo = "";
|
||||
int i = 0;
|
||||
int strTotal = 0;
|
||||
|
||||
|
||||
int index = 0;
|
||||
Map fullToCompressed = new HashMap();
|
||||
|
||||
|
||||
String mondoIndex = "";
|
||||
|
||||
main:
|
||||
|
@ -448,20 +461,20 @@ public class BuildNames implements UCD_Types {
|
|||
String value = (String)m.get(key);
|
||||
log.println(i++ + ": " + key + ": \"" + value + "\"");
|
||||
strTotal += value.length();
|
||||
|
||||
|
||||
|
||||
|
||||
// first 128 are the highest frequency, inc. space
|
||||
|
||||
|
||||
if (index < 128 - SINGLES) {
|
||||
mondo += value;
|
||||
fullToCompressed.put(value, new String((char)(index + reserved)));
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
int pos = mondo.indexOf(value);
|
||||
if (pos >= 0) {
|
||||
// try splitting!
|
||||
|
||||
|
||||
int bestBreak = -1;
|
||||
boolean pickFirst = false;
|
||||
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
|
||||
|
@ -493,22 +506,22 @@ public class BuildNames implements UCD_Types {
|
|||
mondo += value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// high bit on, means 2 bytes, look in array
|
||||
}
|
||||
|
||||
|
||||
log.println("strTotal: " + strTotal);
|
||||
log.println("mondo: " + mondo.length());
|
||||
|
||||
|
||||
int k = 80;
|
||||
for (; k < mondo.length(); k += 80) {
|
||||
log.println(mondo.substring(k-80, k));
|
||||
}
|
||||
log.println(mondo.substring(k-80)); // last line
|
||||
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
static int indexOf(StringBuffer target, String source) {
|
||||
int targetLen = target.length() - source.length();
|
||||
main:
|
||||
|
@ -520,10 +533,10 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
static final int SINGLES = 26 + 10 + 2;
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
static String decode(int x) {
|
||||
if (x < SINGLES) {
|
||||
|
@ -533,6 +546,6 @@ public class BuildNames implements UCD_Types {
|
|||
return " ";
|
||||
}
|
||||
if (x < binaryLimit) {
|
||||
x =
|
||||
x =
|
||||
*/
|
||||
}
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompactName.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -6,28 +19,28 @@ import java.io.*;
|
|||
import java.text.*;
|
||||
|
||||
public class CompactName {
|
||||
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
|
||||
int test = tokenFromString("ABZ");
|
||||
String ss = stringFromToken(test);
|
||||
System.out.println(ss);
|
||||
|
||||
|
||||
CompactName.addWord("ABSOLUTEISM");
|
||||
|
||||
|
||||
for (int i = 0; i < CompactName.lastToken; ++i) {
|
||||
String s = CompactName.stringFromToken(i);
|
||||
System.out.println(s);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static final char[] compactMap = new char[128];
|
||||
static final char[] compactUnmap = new char[128];
|
||||
|
||||
|
||||
static {
|
||||
char counter = 0;
|
||||
compactMap[0] = counter++;
|
||||
|
@ -38,14 +51,14 @@ public class CompactName {
|
|||
compactMap['>'] = counter++;
|
||||
compactMap['<'] = counter++;
|
||||
compactMap['*'] = counter++;
|
||||
|
||||
|
||||
compactUnmap[0] = 0;
|
||||
for (char i = 0; i < compactUnmap.length; ++i) {
|
||||
int x = compactMap[i];
|
||||
if (x != 0) compactUnmap[x] = i;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static String expand(String s) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -58,7 +71,7 @@ public class CompactName {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
static String compact(String s) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
|
@ -72,27 +85,27 @@ public class CompactName {
|
|||
return result.toString();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
static Map string_token = new HashMap();
|
||||
static Map token_string = new HashMap();
|
||||
|
||||
|
||||
static int[] tokenList = new int[40000];
|
||||
static final int tokenStart = 0;
|
||||
static int lastToken = 0;
|
||||
|
||||
|
||||
static int spacedMinimum = Integer.MAX_VALUE;
|
||||
|
||||
|
||||
static boolean isLiteral(int i) {
|
||||
return (i & 0x8000) != 0;
|
||||
}
|
||||
|
||||
|
||||
static int addTokenForString(String s, int lead, int trail) {
|
||||
Object in = string_token.get(s);
|
||||
if (in != null) throw new IllegalArgumentException();
|
||||
int value = (lead << 16) + (trail & 0xFFFF);
|
||||
int result = lastToken;
|
||||
tokenList[lastToken++] = value;
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail);
|
||||
String roundTrip = stringFromToken(result);
|
||||
|
@ -103,7 +116,7 @@ public class CompactName {
|
|||
string_token.put(s, new Integer(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static String stringFromToken(int i) {
|
||||
String result;
|
||||
if ((i & 0x8000) != 0) {
|
||||
|
@ -125,7 +138,7 @@ public class CompactName {
|
|||
if (DEBUG) System.out.println("token: " + i + " => '" + result + "'");
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int tokenFromString(String s) {
|
||||
if (s.length() <= 3) {
|
||||
int first = compactMap[s.charAt(0)];
|
||||
|
@ -137,17 +150,17 @@ public class CompactName {
|
|||
if (in == null) return -1;
|
||||
return ((Integer)in).intValue();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static int addWord(String s) {
|
||||
|
||||
|
||||
int result = tokenFromString(s);
|
||||
if (result != -1) return result;
|
||||
int bestLen = 0;
|
||||
int best_i = 0;
|
||||
|
||||
|
||||
int limit = s.length() - 1;
|
||||
|
||||
|
||||
for (int i = limit; i >= 1; --i) {
|
||||
|
||||
String firstPart = s.substring(0, i);
|
||||
|
@ -155,7 +168,7 @@ public class CompactName {
|
|||
|
||||
int lead = tokenFromString(firstPart);
|
||||
int trail = tokenFromString(lastPart);
|
||||
|
||||
|
||||
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
|
||||
if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
|
||||
return addTokenForString(s, lead, trail);
|
||||
|
@ -187,34 +200,34 @@ public class CompactName {
|
|||
return addTokenForString(s, addWord(firstPart), trail);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// break at multiple of 3
|
||||
|
||||
|
||||
best_i = ((s.length() + 1) / 6) * 3;
|
||||
String firstPart = s.substring(0, best_i);
|
||||
String lastPart = s.substring(best_i);
|
||||
if (DEBUG) show(s, firstPart, lastPart, "Fallback");
|
||||
return addTokenForString(s, addWord(firstPart), addWord(lastPart));
|
||||
}
|
||||
|
||||
|
||||
static void show(String s, String firstPart, String lastPart, String comment) {
|
||||
System.out.println((s) + " => '" + (firstPart)
|
||||
+ "' # '" + (lastPart) + "' " + comment);
|
||||
}
|
||||
|
||||
|
||||
static void startLines() {
|
||||
spacedMinimum = lastToken;
|
||||
}
|
||||
|
||||
|
||||
static int addLine(String s) {
|
||||
|
||||
|
||||
int result = tokenFromString(s);
|
||||
if (result != -1) return result;
|
||||
int bestLen = 0;
|
||||
int best_i = 0;
|
||||
|
||||
|
||||
int limit = s.length() - 2;
|
||||
|
||||
|
||||
for (int i = limit; i >= 1; --i) {
|
||||
char c = s.charAt(i);
|
||||
if (c != ' ') continue;
|
||||
|
@ -224,7 +237,7 @@ public class CompactName {
|
|||
|
||||
int lead = tokenFromString(firstPart);
|
||||
int trail = tokenFromString(lastPart);
|
||||
|
||||
|
||||
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
|
||||
if (DEBUG) show(s, firstPart, lastPart, "MATCH BOTH");
|
||||
return addTokenForString(s, lead, trail);
|
||||
|
@ -253,7 +266,7 @@ public class CompactName {
|
|||
return addTokenForString(s, addLine(firstPart), trail);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
System.out.println("SHOULD HAVE MATCHED!!");
|
||||
throw new IllegalArgumentException("SHOULD HAVE MATCHED!! " + s);
|
||||
}
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
@ -7,20 +20,20 @@ import java.text.NumberFormat;
|
|||
import java.io.*;
|
||||
|
||||
|
||||
/** Simple program to merge UCD files into XML. Not yet documented!!
|
||||
/** Simple program to merge UCD files into XML. Not yet documented!!
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
public final class ConvertUCD implements UCD_Types {
|
||||
public static final boolean SHOW = true;
|
||||
public static final boolean DEBUG = false;
|
||||
|
||||
|
||||
public static int major;
|
||||
public static int minor;
|
||||
public static int update;
|
||||
|
||||
|
||||
static String version;
|
||||
|
||||
|
||||
// varies by version
|
||||
/*
|
||||
public static final String BASE_DIR11 = DATA_DIR + "\\Versions\\";
|
||||
|
@ -29,10 +42,10 @@ public final class ConvertUCD implements UCD_Types {
|
|||
public static final String BASE_DIR30 = DATA_DIR + "\\Update 3.0.1\\";
|
||||
public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\";
|
||||
*/
|
||||
|
||||
|
||||
//public static final String blocksnamePlain = "Blocks.txt";
|
||||
//public static final String blocksname31 = "Blocks-4d2.beta";
|
||||
|
||||
|
||||
/** First item is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
@ -47,10 +60,10 @@ public final class ConvertUCD implements UCD_Types {
|
|||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"ExtraProperties", "xp"},
|
||||
{"PropList", "binary"},
|
||||
|
||||
{"PropList", "binary"},
|
||||
|
||||
//{"ExtraProperties", "xp"},
|
||||
|
||||
|
||||
{"EastAsianWidth", "ea", "OMIT"},
|
||||
{"LineBreak", "lb", "OMIT"},
|
||||
{"SpecialCasing", "*sl", "*st", "*su", "sc"},
|
||||
|
@ -76,10 +89,10 @@ public final class ConvertUCD implements UCD_Types {
|
|||
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
|
||||
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"PropList-3.1.0d5.beta", "binary"},
|
||||
|
||||
{"PropList-3.1.0d5.beta", "binary"},
|
||||
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
|
@ -98,13 +111,13 @@ public final class ConvertUCD implements UCD_Types {
|
|||
/*
|
||||
{"UnicodeData-3.1.0d8.beta", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
{"ExtraProperties", "xp"},
|
||||
|
||||
|
||||
{"EastAsianWidth-4d7.beta", "ea", "OMIT"},
|
||||
{"LineBreak-6d6.beta", "lb", "OMIT"},
|
||||
{"SpecialCasing-4d1.beta", "*sl", "*st", "*su", "sc"},
|
||||
{"CompositionExclusions-3d6.beta", "ce"},
|
||||
{"CaseFolding-3d4.beta", "OMIT", "*fc"},
|
||||
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
|
||||
{"PropList-3.1.0d2.beta", "PROP", "OMIT"},
|
||||
{"ArabicShaping", "OMIT", "jt", "jg"},
|
||||
{"BidiMirroring", "*bg"},
|
||||
{"Scripts-1d4", "sn"},
|
||||
|
@ -114,9 +127,9 @@ public final class ConvertUCD implements UCD_Types {
|
|||
/*
|
||||
{"Jamo", "jn"},
|
||||
//
|
||||
|
||||
|
||||
//"NamesList-3.1.0d1.beta"
|
||||
|
||||
|
||||
static String[][] labelList30 = {
|
||||
// Labels for the incoming files. Labels MUST match field order in file.
|
||||
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
|
||||
|
@ -133,28 +146,28 @@ public final class ConvertUCD implements UCD_Types {
|
|||
{"BidiMirroring", "*bg"},
|
||||
/*
|
||||
{"Jamo", "jn"},
|
||||
{"PropList.alpha", "RANGE", "OMIT"},
|
||||
{"PropList.alpha", "RANGE", "OMIT"},
|
||||
//
|
||||
};
|
||||
|
||||
|
||||
static String[][] labelList11 = {
|
||||
{"UnicodeData-1.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
|
||||
static String[][] labelList20 = {
|
||||
{"UnicodeData-2.0", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
|
||||
|
||||
static String[][] labelList21 = {
|
||||
{"UnicodeData-2.1", "n", "gc", "cc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"},
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
// handles
|
||||
public static final String blocksname = "Blocks";
|
||||
//public static final String[][] labelList;
|
||||
public static final boolean NEWPROPS = true;
|
||||
|
||||
|
||||
/*
|
||||
static {
|
||||
switch (major*10 + minor) {
|
||||
|
@ -180,23 +193,23 @@ public final class ConvertUCD implements UCD_Types {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
*/
|
||||
static final String dataFilePrefix = "UCD_Data";
|
||||
|
||||
|
||||
|
||||
|
||||
// MAIN!!
|
||||
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
System.out.println("ConvertUCD");
|
||||
|
||||
|
||||
log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(GEN_DIR + "UCD-log.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
log.write("\uFEFF"); // BOM
|
||||
|
||||
|
||||
try {
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
version = args[i];
|
||||
|
@ -206,14 +219,14 @@ public final class ConvertUCD implements UCD_Types {
|
|||
major = Integer.parseInt(parts[0]);
|
||||
minor = Integer.parseInt(parts[1]);
|
||||
update = Integer.parseInt(parts[2]);
|
||||
|
||||
|
||||
toJava();
|
||||
}
|
||||
} finally {
|
||||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static void toXML() throws Exception {
|
||||
// Blocks is special
|
||||
|
@ -228,7 +241,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
writeXML();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
static void toJava() throws Exception {
|
||||
// Blocks is special
|
||||
// Unihan is special
|
||||
|
@ -239,7 +252,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
} else {
|
||||
readSemi(labelList[0]); // TESTING ONLY
|
||||
}
|
||||
|
||||
|
||||
Iterator it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
|
@ -253,40 +266,40 @@ public final class ConvertUCD implements UCD_Types {
|
|||
|
||||
writeJavaData();
|
||||
}
|
||||
|
||||
|
||||
static PrintWriter log;
|
||||
//static String directory = BASE_DIR;
|
||||
//static Map appendDuplicates = new HashMap();
|
||||
|
||||
|
||||
/** First item in labels is file name, rest are field names (skipping character).
|
||||
* "OMIT" is special -- means don't record
|
||||
*/
|
||||
|
||||
|
||||
static HashMap isHex = new HashMap();
|
||||
static HashMap defaults = new HashMap();
|
||||
|
||||
|
||||
static {
|
||||
for (int j = 0; j < labelList.length; ++j) {
|
||||
String[] labels = labelList[j];
|
||||
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
boolean hex = false;
|
||||
String def = null;
|
||||
//char appendChar = '\u0000';
|
||||
|
||||
|
||||
// pull off "*": hex interpretation
|
||||
if (labels[i].charAt(0) == '*') { // HEX value
|
||||
hex = true;
|
||||
labels[i] = labels[i].substring(1);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
// pull off "$": append duplicates
|
||||
if (labels[i].charAt(0) == '$') { // HEX value
|
||||
appendChar = labels[i].charAt(1);
|
||||
labels[i] = labels[i].substring(2);
|
||||
}
|
||||
|
||||
|
||||
// pull off default values
|
||||
int pos = labels[i].indexOf('-');
|
||||
if (pos >= 0) {
|
||||
|
@ -296,16 +309,16 @@ public final class ConvertUCD implements UCD_Types {
|
|||
*/
|
||||
// store results
|
||||
// we do this after all processing, so that the label is clean!!
|
||||
|
||||
|
||||
if (hex) isHex.put(labels[i], "");
|
||||
//if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
|
||||
defaults.put(labels[i], def);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static List blockData = new LinkedList();
|
||||
|
||||
|
||||
static void readBlocks() throws Exception {
|
||||
System.out.println("Reading 'Blocks'");
|
||||
BufferedReader input = Utility.openUnicodeFile(blocksname, version);
|
||||
|
@ -316,7 +329,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
|
||||
//String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
|
@ -326,12 +339,12 @@ public final class ConvertUCD implements UCD_Types {
|
|||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
if (count != 3) throw new ChainException("Bad count in Blocks", null);
|
||||
blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()});
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception at: " + line);
|
||||
throw e;
|
||||
|
@ -339,9 +352,9 @@ public final class ConvertUCD implements UCD_Types {
|
|||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static Set properties = new TreeSet();
|
||||
|
||||
|
||||
static void readSemi(String[] labels) throws Exception {
|
||||
System.out.println();
|
||||
System.out.println("Reading '" + labels[0] + "'");
|
||||
|
@ -361,14 +374,14 @@ public final class ConvertUCD implements UCD_Types {
|
|||
boolean showedSemi = false;
|
||||
boolean showedShort = false;
|
||||
String line = "";
|
||||
|
||||
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
for (int lineNumber = 1; ; ++lineNumber) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
|
||||
|
||||
String original = line;
|
||||
String comment = "";
|
||||
int commentPos = line.indexOf('#');
|
||||
|
@ -378,15 +391,15 @@ public final class ConvertUCD implements UCD_Types {
|
|||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
|
||||
|
||||
if (parts[0].equals("2801")) {
|
||||
System.out.println("debug?");
|
||||
}
|
||||
|
||||
|
||||
// fix malformed or simple lists.
|
||||
|
||||
|
||||
if (count != labels.length) {
|
||||
if (count == labels.length + 1 && parts[count-1].equals("")) {
|
||||
if (!showedSemi) System.out.println("Extra semicolon in: " + original);
|
||||
|
@ -401,11 +414,11 @@ public final class ConvertUCD implements UCD_Types {
|
|||
parts[i] = "";
|
||||
}
|
||||
} else {
|
||||
throw new ChainException("wrong count: {0}",
|
||||
throw new ChainException("wrong count: {0}",
|
||||
new Object[] {new Integer(line), new Integer(count)});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// store char
|
||||
// first field is always character OR range. May be UTF-32
|
||||
int cpTop;
|
||||
|
@ -420,9 +433,9 @@ public final class ConvertUCD implements UCD_Types {
|
|||
cpTop = cpStart;
|
||||
if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// properties first
|
||||
if (labels[1].equals("PROP")) {
|
||||
String prop = parts[2].trim();
|
||||
|
@ -436,7 +449,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
properties.add(prop);
|
||||
if (Utility.find(prop, UCD_Names.DeletedProperties) == -1) { // only undeleted
|
||||
int end = UTF32.char32At(Utility.fromHex(parts[1]),0);
|
||||
if (end == 0) end = cpStart;
|
||||
if (end == 0) end = cpStart;
|
||||
|
||||
for (int j = cpStart; j <= end; ++j) {
|
||||
if (j != UCD.mapToRepresentative(j, false)) continue;
|
||||
|
@ -447,7 +460,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
} else { // not range!
|
||||
String val = "";
|
||||
String lastVal;
|
||||
|
||||
|
||||
for (int i = 1; i < labels.length; ++i) {
|
||||
String key = labels[i];
|
||||
lastVal = val;
|
||||
|
@ -462,7 +475,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
|
||||
for (int cps = cpStart; cps <= cpTop; ++cps) {
|
||||
if (UCD.mapToRepresentative(cps, false) != cps) continue; // skip condensed ranges
|
||||
|
||||
|
||||
if (key.equals("binary")) {
|
||||
appendCharProperties(cps, val);
|
||||
} else if (key.equals("fc")) {
|
||||
|
@ -502,7 +515,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
//printValues("JOINING_TYPE", jtSet);
|
||||
//printValues("JOINING_GROUP", jgSet);
|
||||
}
|
||||
|
||||
|
||||
static void printValues(String title, Set s) {
|
||||
Iterator it = s.iterator();
|
||||
System.out.println("public static String[] " + title + " = {");
|
||||
|
@ -521,9 +534,9 @@ public final class ConvertUCD implements UCD_Types {
|
|||
System.out.println(" LIMIT_" + title + " = " + count);
|
||||
System.out.println(";");
|
||||
}
|
||||
|
||||
|
||||
static Map charData = new TreeMap();
|
||||
|
||||
|
||||
static void writeXML() throws IOException {
|
||||
System.out.println("Writing 'UCD-Main.xml'");
|
||||
BufferedWriter output = new BufferedWriter(
|
||||
|
@ -531,29 +544,29 @@ public final class ConvertUCD implements UCD_Types {
|
|||
new FileOutputStream(UCD.BIN_DIR + "UCD_Data.xml"),
|
||||
"UTF8"),
|
||||
32*1024);
|
||||
|
||||
|
||||
try {
|
||||
// write header
|
||||
|
||||
|
||||
output.write("<?xml version='1.0' encoding='utf-8'?>\r\n");
|
||||
output.write("<UnicodeCharacterDatabase>\r\n");
|
||||
output.write(" <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. -->\r\n");
|
||||
output.write(" <unicode version='" + major + "' minor='" + minor + "' update='" + update + "'/>\r\n");
|
||||
output.write(" <fileVersion status='DRAFT' date='" + new Date() + "'/>\r\n");
|
||||
|
||||
|
||||
// write blocks
|
||||
|
||||
|
||||
Iterator it = blockData.iterator();
|
||||
while (it.hasNext()) {
|
||||
String[] block = (String[]) it.next();
|
||||
output.write(" <block start='" + Utility.quoteXML(block[0])
|
||||
output.write(" <block start='" + Utility.quoteXML(block[0])
|
||||
+ "' end='" + Utility.quoteXML(block[1])
|
||||
+ "' name='" + Utility.quoteXML(block[2])
|
||||
+ "'/>\r\n" );
|
||||
}
|
||||
|
||||
|
||||
// write char data
|
||||
|
||||
|
||||
it = charData.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Integer cc = (Integer) it.next();
|
||||
|
@ -575,9 +588,9 @@ public final class ConvertUCD implements UCD_Types {
|
|||
*/
|
||||
output.write("/>\r\n");
|
||||
}
|
||||
|
||||
|
||||
// write footer
|
||||
|
||||
|
||||
output.write("</UnicodeCharacterDatabase>\r\n");
|
||||
} finally {
|
||||
output.close();
|
||||
|
@ -592,7 +605,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
new BufferedOutputStream(
|
||||
new FileOutputStream(UCD.BIN_DIR + dataFilePrefix + version + ".bin"),
|
||||
128*1024));
|
||||
|
||||
|
||||
// write header
|
||||
dataOut.writeByte(BINARY_FORMAT);
|
||||
dataOut.writeByte(major);
|
||||
|
@ -603,7 +616,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
dataOut.writeInt(charData.size());
|
||||
System.out.println("Data Size: " + NumberFormat.getInstance().format(charData.size()));
|
||||
int count = 0;
|
||||
|
||||
|
||||
// write records
|
||||
try {
|
||||
// write char data
|
||||
|
@ -612,7 +625,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
Object cc = (Object) it.next();
|
||||
//codePoint = UTF32.char32At(cc,0);
|
||||
if (DEBUG) System.out.println(Utility.hex(cc));
|
||||
|
||||
|
||||
UData uData = (UData) charData.get(cc);
|
||||
if (false && uData.name == null) {
|
||||
System.out.println("Warning: NULL name\r\n" + uData);
|
||||
|
@ -632,13 +645,13 @@ public final class ConvertUCD implements UCD_Types {
|
|||
dataOut.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static String[] xsSplit = new String[40];
|
||||
|
||||
|
||||
// Cache a little bit for speed
|
||||
static int getEntryCodePoint = -1;
|
||||
static UData getEntryUData = null;
|
||||
|
||||
|
||||
static UData getEntryIfExists(int cp) {
|
||||
if (cp == getEntryCodePoint) return getEntryUData;
|
||||
Integer cc = new Integer(cp);
|
||||
|
@ -648,7 +661,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
getEntryUData = charEntry;
|
||||
return charEntry;
|
||||
}
|
||||
|
||||
|
||||
/* Get entry in table for cc
|
||||
*/
|
||||
static UData getEntry(int cp) {
|
||||
|
@ -671,7 +684,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
UData charEntry = getEntry(cp);
|
||||
charEntry.binaryProperties |= (1 << binProp);
|
||||
}
|
||||
|
||||
|
||||
static void appendCharProperties(int cp, String key) {
|
||||
int ind;
|
||||
//if (true || NEWPROPS) {
|
||||
|
@ -683,17 +696,17 @@ public final class ConvertUCD implements UCD_Types {
|
|||
//charEntry.binaryProperties |= (1 << ind);
|
||||
setBinaryProperty(cp, ind);
|
||||
}
|
||||
|
||||
|
||||
static Set jtSet = new TreeSet();
|
||||
static Set jgSet = new TreeSet();
|
||||
|
||||
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
static void addCharData(int cp, String key, String value) {
|
||||
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
|
||||
UData charEntry = getEntry(cp);
|
||||
//if (cp < 10) System.out.println(" " + charEntry);
|
||||
|
||||
|
||||
if (key.equals("bm")) {
|
||||
if (value.equals("Y")) charEntry.binaryProperties |= 1;
|
||||
} else if (key.equals("ce")) {
|
||||
|
@ -723,7 +736,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
}
|
||||
}
|
||||
setField(charEntry, key, Utility.fromHex(value));
|
||||
|
||||
|
||||
// fix the numeric fields to be more sensible
|
||||
} else if (key.equals("dd")) {
|
||||
if (charEntry.numericType < UCD_Types.DECIMAL) {
|
||||
|
@ -749,7 +762,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
setField(charEntry, key, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static public void setField(UData uData, String fieldName, String fieldValue) {
|
||||
try {
|
||||
if (fieldName.equals("n")) {
|
||||
|
@ -764,17 +777,17 @@ public final class ConvertUCD implements UCD_Types {
|
|||
uData.simpleLowercase = fieldValue;
|
||||
} else if (fieldName.equals("tc")) {
|
||||
uData.simpleTitlecase = fieldValue;
|
||||
|
||||
|
||||
} else if (fieldName.equals("su")) {
|
||||
uData.fullUppercase = fieldValue;
|
||||
} else if (fieldName.equals("sl")) {
|
||||
uData.fullLowercase = fieldValue;
|
||||
} else if (fieldName.equals("st")) {
|
||||
uData.fullTitlecase = fieldValue;
|
||||
|
||||
|
||||
} else if (fieldName.equals("sc")) {
|
||||
uData.specialCasing = fieldValue;
|
||||
|
||||
|
||||
} else if (fieldName.equals("xp")) {
|
||||
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP);
|
||||
//UCD_Names.BP_OLD
|
||||
|
@ -796,20 +809,20 @@ public final class ConvertUCD implements UCD_Types {
|
|||
uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.DT);
|
||||
} else if (fieldName.equals("nt")) {
|
||||
uData.numericType = Utility.lookup(fieldValue, UCD_Names.NT);
|
||||
|
||||
|
||||
} else if (fieldName.equals("ea")) {
|
||||
uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EA);
|
||||
} else if (fieldName.equals("lb")) {
|
||||
uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LB);
|
||||
|
||||
|
||||
} else if (fieldName.equals("sn")) {
|
||||
uData.script = Utility.lookup(fieldValue, UCD_Names.SCRIPT);
|
||||
|
||||
|
||||
} else if (fieldName.equals("jt")) {
|
||||
uData.joiningType = Utility.lookup(fieldValue, UCD_Names.JOINING_TYPE);
|
||||
} else if (fieldName.equals("jg")) {
|
||||
uData.joiningGroup = Utility.lookup(fieldValue, UCD_Names.OLD_JOINING_GROUP);
|
||||
|
||||
|
||||
} else if (fieldName.equals("nv")) {
|
||||
if (major < 2) {
|
||||
if (fieldValue.equals("-")) return;
|
||||
|
@ -827,5 +840,5 @@ public final class ConvertUCD implements UCD_Types {
|
|||
"Bad field name= \"{0}\", value= \"{1}\"", new Object[] {fieldName, fieldValue}, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
@ -6,52 +19,52 @@ import com.ibm.text.utility.*;
|
|||
|
||||
final class DerivedPropertyLister extends PropertyLister {
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
|
||||
static int enum = 0;
|
||||
static final int
|
||||
PropMath = 0,
|
||||
PropAlphabetic = 1,
|
||||
PropLowercase = 2,
|
||||
PropUppercase = 3,
|
||||
|
||||
|
||||
ID_Start = 4,
|
||||
ID_Continue_NO_Cf = 5,
|
||||
|
||||
|
||||
Mod_ID_Start = 6,
|
||||
Mod_ID_Continue_NO_Cf = 7,
|
||||
|
||||
|
||||
Missing_Uppercase = 8,
|
||||
Missing_Lowercase = 9,
|
||||
Missing_Mixedcase = 10,
|
||||
|
||||
|
||||
FC_NFKC_Closure = 11,
|
||||
|
||||
|
||||
FullCompExclusion = 12,
|
||||
FullCompInclusion = 13,
|
||||
|
||||
|
||||
QuickNFD = 14,
|
||||
QuickNFC = 15,
|
||||
QuickNFKD = 16,
|
||||
QuickNFKC = 17,
|
||||
|
||||
|
||||
ExpandsOnNFD = 18,
|
||||
ExpandsOnNFC = 19,
|
||||
ExpandsOnNFKD = 20,
|
||||
ExpandsOnNFKC = 21,
|
||||
|
||||
|
||||
GenNFD = 22,
|
||||
GenNFC = 23,
|
||||
GenNFKD = 24,
|
||||
GenNFKC = 25,
|
||||
|
||||
|
||||
LIMIT = 26;
|
||||
;
|
||||
|
||||
|
||||
private int propMask;
|
||||
private Normalizer[] nf = new Normalizer[4];
|
||||
private Normalizer nfd, nfc, nfkd, nfkc;
|
||||
int width;
|
||||
|
||||
|
||||
public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
|
||||
this.propMask = propMask;
|
||||
this.output = output;
|
||||
|
@ -60,7 +73,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
nfc = nf[1] = new Normalizer(Normalizer.NFC);
|
||||
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
|
||||
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
|
||||
width = super.minPropertyWidth();
|
||||
switch (propMask) {
|
||||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
|
@ -75,7 +88,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String headerString() {
|
||||
String result = "# Derived Property: ";
|
||||
switch (propMask) {
|
||||
|
@ -88,31 +101,31 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
case GenNFD: case GenNFC: case GenNFKD: case GenNFKC:
|
||||
result += NAME[propMask-GenNFD] + "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Normalized forms, where different from the characters themselves."
|
||||
+ ((propMask == 5 || propMask == 3)
|
||||
+ ((propMask == 5 || propMask == 3)
|
||||
? ""
|
||||
: "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly.")
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
|
||||
break;
|
||||
case ID_Start: result +=
|
||||
case ID_Start: result +=
|
||||
"ID_Start"
|
||||
+ "\r\n# Characters that can start an identifier."
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
|
||||
break;
|
||||
case ID_Continue_NO_Cf: result +=
|
||||
case ID_Continue_NO_Cf: result +=
|
||||
"ID_Continue"
|
||||
+ "\r\n# Characters that can continue an identifier."
|
||||
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
|
||||
+ "\r\n# NOTE: Cf characters should be filtered out.";
|
||||
break;
|
||||
case Mod_ID_Start: result +=
|
||||
case Mod_ID_Start: result +=
|
||||
"XID_Start"
|
||||
+ "\r\n# ID_Start modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
break;
|
||||
case Mod_ID_Continue_NO_Cf: result +=
|
||||
case Mod_ID_Continue_NO_Cf: result +=
|
||||
"XID_Continue"
|
||||
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
|
@ -124,7 +137,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
result += "Math"
|
||||
+ "\r\n# Generated from: Sm + Other_Math";
|
||||
break;
|
||||
case PropAlphabetic:
|
||||
case PropAlphabetic:
|
||||
result += "Alphabetic"
|
||||
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
|
||||
break;
|
||||
|
@ -201,17 +214,17 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
default: return "Unimplemented!!";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//public String optionalComment(int cp) {
|
||||
// return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]";
|
||||
//}
|
||||
|
||||
|
||||
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return width;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};
|
||||
/*
|
||||
public String optionalComment(int cp) {
|
||||
|
@ -229,8 +242,8 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
if (!ucdData.isAssigned(cp)) return EXCLUDE;
|
||||
//if (cp == 0xFFFF) {
|
||||
|
@ -240,13 +253,13 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
//if (cp == 0x0385) {
|
||||
// System.out.println(Utility.hex(firstRealCp));
|
||||
//}
|
||||
|
||||
|
||||
String cps;
|
||||
byte xCat;
|
||||
|
||||
|
||||
switch (propMask) {
|
||||
default: return EXCLUDE;
|
||||
|
||||
|
||||
case ExpandsOnNFD: case ExpandsOnNFC: case ExpandsOnNFKD: case ExpandsOnNFKC:
|
||||
if (ucdData.getDecompositionType(cp) == NONE) return EXCLUDE;
|
||||
cps = UTF32.valueOf32(cp);
|
||||
|
@ -307,17 +320,17 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
return EXCLUDE;
|
||||
case FullCompExclusion:
|
||||
/*
|
||||
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
|
||||
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
|
||||
including all characters whose canonical decomposition consists of a single character.
|
||||
(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
|
||||
file by including all characters whose canonical decomposition consists of a sequence
|
||||
of characters, the first of which has a non-zero combining class.
|
||||
*/
|
||||
*/
|
||||
{
|
||||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return EXCLUDE;
|
||||
|
||||
|
||||
if (isCompEx(cp)) return INCLUDE;
|
||||
return EXCLUDE;
|
||||
}
|
||||
|
@ -326,13 +339,13 @@ of characters, the first of which has a non-zero combining class.
|
|||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return EXCLUDE;
|
||||
|
||||
|
||||
if (isCompEx(cp)) return EXCLUDE;
|
||||
return INCLUDE;
|
||||
}
|
||||
case FC_NFKC_Closure:
|
||||
if (!ucdData.isRepresented(cp)) return EXCLUDE;
|
||||
|
||||
|
||||
/*
|
||||
b = Normalize(Fold(a));
|
||||
c = Normalize(Fold(b));
|
||||
|
@ -353,7 +366,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
return BREAK;
|
||||
}
|
||||
|
||||
|
||||
case QuickNFD: case QuickNFC: case QuickNFKD: case QuickNFKC:
|
||||
lastValue = currentValue;
|
||||
Normalizer nfx = nf[propMask - QuickNFD];
|
||||
|
@ -364,8 +377,8 @@ of characters, the first of which has a non-zero combining class.
|
|||
if (currentValue != lastValue) return BREAK;
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// handle script stuff
|
||||
/*
|
||||
if (firstRealCp == -1) return INCLUDE;
|
||||
|
@ -373,12 +386,12 @@ of characters, the first of which has a non-zero combining class.
|
|||
if (cat == cat2) return INCLUDE;
|
||||
int mc = UCD.mainCategoryMask(cat);
|
||||
if (LETTER_MASK == mc && mc == UCD.mainCategoryMask(cat2)) return INCLUDE;
|
||||
|
||||
|
||||
return BREAK;
|
||||
*/
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
static Map computedValue = new HashMap();
|
||||
static String getComputedValue(int cp) {
|
||||
return (String) computedValue.get(new Integer(cp));
|
||||
|
@ -388,8 +401,8 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
static String lastValue = "";
|
||||
static String currentValue = "";
|
||||
|
||||
boolean isCompEx(int cp) {
|
||||
|
||||
boolean isCompEx(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, CompositionExclusion)) return true;
|
||||
String decomp = ucdData.getDecompositionMapping(cp);
|
||||
if (UTF32.length32(decomp) == 1) return true;
|
||||
|
@ -397,17 +410,17 @@ of characters, the first of which has a non-zero combining class.
|
|||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
StringBuffer foldBuffer = new StringBuffer();
|
||||
|
||||
|
||||
String fold(int cp) {
|
||||
return ucdData.getCase(cp, FULL, FOLD);
|
||||
}
|
||||
|
||||
|
||||
String fold(String s) {
|
||||
return ucdData.getCase(s, FULL, FOLD);
|
||||
}
|
||||
|
||||
|
||||
byte getDecompCat(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lu
|
||||
|
@ -416,7 +429,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
if (!nf[2].normalizationDiffers(cp)) return Lo;
|
||||
|
||||
|
||||
String norm = nf[2].normalize(cp);
|
||||
int cp2;
|
||||
boolean gotUpper = false;
|
||||
|
@ -437,4 +450,4 @@ of characters, the first of which has a non-zero combining class.
|
|||
return cat;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,23 +1,36 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
class DiffPropertyLister extends PropertyLister {
|
||||
private UCD oldUCD;
|
||||
|
||||
|
||||
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
|
||||
this.output = output;
|
||||
this.ucdData = UCD.make(newUCDName);
|
||||
if (oldUCDName != null) this.oldUCD = UCD.make(oldUCDName);
|
||||
}
|
||||
|
||||
|
||||
public byte status (int cp) {
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return ucdData.getVersion();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
|
@ -27,7 +40,7 @@ class DiffPropertyLister extends PropertyLister {
|
|||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
public byte status(int lastCp, int cp) {
|
||||
/*if (cp == 0xFFFF) {
|
||||
|
@ -36,7 +49,7 @@ class DiffPropertyLister extends PropertyLister {
|
|||
*/
|
||||
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
|
||||
|
||||
public int print() {
|
||||
String status;
|
||||
if (oldUCD != null) {
|
||||
|
@ -56,10 +69,10 @@ class DiffPropertyLister extends PropertyLister {
|
|||
} else {
|
||||
output.println("# Total " + count + " code points allocated in " + ucdData.getVersion());
|
||||
}
|
||||
|
||||
|
||||
output.println();
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -8,19 +21,19 @@ import com.ibm.text.utility.*;
|
|||
public class GenerateCaseFolding implements UCD_Types {
|
||||
public static boolean DEBUG = false;
|
||||
public static UCD ucd = UCD.make("310");
|
||||
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
makeCaseFold();
|
||||
//getAge();
|
||||
}
|
||||
|
||||
|
||||
public static void makeCaseFold() throws java.io.IOException {
|
||||
System.out.println("Making Full Data");
|
||||
Map fullData = getCaseFolding(true);
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleData = getCaseFolding(false);
|
||||
// write the data
|
||||
|
||||
|
||||
System.out.println("Writing");
|
||||
PrintWriter out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
|
@ -48,30 +61,30 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
|
||||
static void drawLine(PrintWriter out, int ch, String type, String result) {
|
||||
out.println(Utility.hex(ch)
|
||||
+ "; " + type +
|
||||
"; " + Utility.hex(result, " ") +
|
||||
out.println(Utility.hex(ch)
|
||||
+ "; " + type +
|
||||
"; " + Utility.hex(result, " ") +
|
||||
"; # " + ucd.getName(ch));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static Map getCaseFolding(boolean full) throws java.io.IOException {
|
||||
Map data = new TreeMap();
|
||||
Map repChar = new TreeMap();
|
||||
//String option = "";
|
||||
|
||||
|
||||
// get the equivalence classes
|
||||
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!ucd.isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full);
|
||||
}
|
||||
|
||||
|
||||
// get the representative characters
|
||||
|
||||
|
||||
Iterator it = data.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
|
@ -93,7 +106,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
if (rep == null) System.err.println("No representative for: " + toString(set));
|
||||
else if (repGood < 128) {
|
||||
System.err.println("Non-optimal!!: "
|
||||
System.err.println("Non-optimal!!: "
|
||||
+ ucd.getName(rep) + ", " + toString(set,true));
|
||||
}
|
||||
it2 = set.iterator();
|
||||
|
@ -104,7 +117,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return repChar;
|
||||
}
|
||||
|
||||
|
||||
static int goodness(String s, boolean full) {
|
||||
if (s == null) return 0;
|
||||
int result = s.length();
|
||||
|
@ -113,7 +126,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static Normalizer NFC = new Normalizer(Normalizer.NFC);
|
||||
/*
|
||||
static HashSet temp = new HashSet();
|
||||
|
@ -135,12 +148,12 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
String
|
||||
String
|
||||
String lower1 = ucd.getLowercase(ch);
|
||||
String lower2 = ucd.toLowercase(ch,option);
|
||||
|
||||
|
||||
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
|
||||
//String lower1 = String.valueOf(ucd.getLowercase(ch));
|
||||
//String lower = ucd.toLowercase(ch2,option);
|
||||
|
@ -148,9 +161,9 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
String lowerUpper = ucd.toLowercase(upper,option);
|
||||
//String title = ucd.toTitlecase(ch2,option);
|
||||
//String lowerTitle = ucd.toLowercase(upper,option);
|
||||
|
||||
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
||||
output.println(Utility.hex(ch)
|
||||
|
||||
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
||||
output.println(Utility.hex(ch)
|
||||
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
|
||||
+ "; " + Utility.hex(lowerUpper," ")
|
||||
+ ";\t#" + ucd.getName(ch)
|
||||
|
@ -163,7 +176,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
//}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
static void getClosure(int ch, Map data, boolean full) {
|
||||
String charStr = UTF32.valueOf32(ch);
|
||||
String lowerStr = lower(charStr, full);
|
||||
|
@ -171,17 +184,17 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
String upperStr = upper(charStr, full);
|
||||
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
|
||||
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
|
||||
|
||||
|
||||
// make new set
|
||||
Set set = new TreeSet();
|
||||
set.add(charStr);
|
||||
data.put(charStr, set);
|
||||
|
||||
|
||||
// add cases to get started
|
||||
add(set, lowerStr, data);
|
||||
add(set, upperStr, data);
|
||||
add(set, titleStr, data);
|
||||
|
||||
|
||||
// close it
|
||||
main:
|
||||
while (true) {
|
||||
|
@ -197,15 +210,15 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static String lower(String s, boolean full) {
|
||||
String result = lower2(s,full);
|
||||
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
||||
}
|
||||
|
||||
|
||||
// These functions are no longer necessary, since UCD is parameterized,
|
||||
// but it's not worth changing
|
||||
|
||||
|
||||
static String lower2(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
|
@ -213,7 +226,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return ucd.getCase(s, FULL, LOWER);
|
||||
}
|
||||
|
||||
|
||||
static String upper(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
|
@ -221,7 +234,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return ucd.getCase(s, SIMPLE, UPPER);
|
||||
}
|
||||
|
||||
|
||||
static String title(String s, boolean full) {
|
||||
if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
|
@ -229,7 +242,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return ucd.getCase(s, SIMPLE, TITLE);
|
||||
}
|
||||
|
||||
|
||||
static boolean add(Set set, String s, Map data) {
|
||||
if (set.contains(s)) return false;
|
||||
set.add(s);
|
||||
|
@ -246,7 +259,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
if (DEBUG) System.err.println("done adding: " + toString(set));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static String toString(Set set) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
|
@ -259,7 +272,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
|
||||
static String toString(Set set, boolean t) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
|
@ -272,7 +285,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
|
||||
static final void getAge() throws IOException {
|
||||
PrintStream log = new PrintStream(
|
||||
new BufferedOutputStream (
|
||||
|
@ -298,37 +311,37 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
|
||||
UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
|
||||
UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
|
||||
|
||||
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
|
||||
log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
|
||||
+ n.format(u11.count()));
|
||||
log.println();
|
||||
u11.print(log, false, false, "1.1");
|
||||
|
||||
|
||||
UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
|
||||
log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
|
||||
+ n.format(u20m.count()));
|
||||
log.println();
|
||||
u20m.print(log, false, false, "2.0");
|
||||
|
||||
UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
|
||||
log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
|
||||
+ n.format(u21m.count()));
|
||||
log.println();
|
||||
u21m.print(log, false, false, "2.1");
|
||||
|
||||
UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
|
||||
log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
|
||||
+ n.format(u30m.count()));
|
||||
log.println();
|
||||
u30m.print(log, false, false, "3.0");
|
||||
|
||||
UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
|
||||
log.println();
|
||||
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
|
||||
log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
|
||||
+ n.format(u31m.count()));
|
||||
log.println();
|
||||
u31m.print(log, false, false, "3.1");
|
||||
|
@ -336,7 +349,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
} finally {
|
||||
if (log != null) log.close();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -8,21 +21,21 @@ import java.text.SimpleDateFormat;
|
|||
import com.ibm.text.utility.*;
|
||||
|
||||
public class GenerateData implements UCD_Types {
|
||||
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
ucd = UCD.make();
|
||||
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
String version = ucd.getVersion();
|
||||
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
int mask = 0;
|
||||
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Argument: " + args[i]);
|
||||
|
||||
|
||||
if (arg.equalsIgnoreCase("version")) {
|
||||
version = args[++i];
|
||||
ucd = UCD.make(version);
|
||||
|
@ -37,13 +50,13 @@ public class GenerateData implements UCD_Types {
|
|||
"DerivedBidiClass-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedGeneralCategory-" + version );
|
||||
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
|
||||
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
|
@ -91,30 +104,30 @@ public class GenerateData implements UCD_Types {
|
|||
System.out.println(" ! Unknown option -- must be one of the following (case-insensitive)");
|
||||
System.out.println(" ! generateCompExclusions,...");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
// HEADER_DERIVED, "DerivedPropData2-" + version );
|
||||
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-" + version );
|
||||
//listStrings("LowerCase-" + version , 0,0);
|
||||
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-" + version );
|
||||
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
|
||||
//
|
||||
}
|
||||
System.out.println("END");
|
||||
}
|
||||
|
||||
|
||||
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
|
||||
public static void checkHoffman(String test) {
|
||||
String result = nfkc.normalize(test);
|
||||
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
|
||||
|
@ -123,7 +136,7 @@ public class GenerateData implements UCD_Types {
|
|||
System.out.println();
|
||||
show(result, 0);
|
||||
}
|
||||
|
||||
|
||||
public static void show(String s, int indent) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
|
@ -137,16 +150,16 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
|
||||
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
}
|
||||
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
|
||||
public static String fixFile(String s) {
|
||||
int len = s.length();
|
||||
if (!s.endsWith(".txt")) return s;
|
||||
|
@ -156,9 +169,9 @@ public class GenerateData implements UCD_Types {
|
|||
System.out.println("Fixing File Name");
|
||||
return s.substring(0,len-6) + s.substring(len-4);
|
||||
}
|
||||
|
||||
|
||||
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
|
||||
|
||||
|
||||
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
|
||||
output.println("# " + fileName + ".txt");
|
||||
output.println("#");
|
||||
|
@ -179,7 +192,7 @@ public class GenerateData implements UCD_Types {
|
|||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
|
||||
|
@ -194,13 +207,13 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public static void listStrings(String file, int type, int subtype) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
UCD ucd30 = UCD.make("300");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) System.out.println("# " + i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
|
@ -215,17 +228,17 @@ public class GenerateData implements UCD_Types {
|
|||
output.close();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public static void generateCompExclusions() throws IOException {
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
|
||||
new CompLister(output).print();
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
static class CompLister extends PropertyLister {
|
||||
UCD oldUCD;
|
||||
int oldLength = 0;
|
||||
|
||||
|
||||
public CompLister(PrintStream output) {
|
||||
this.output = output;
|
||||
ucdData = UCD.make("310");
|
||||
|
@ -236,7 +249,7 @@ public class GenerateData implements UCD_Types {
|
|||
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
|
||||
}
|
||||
public byte status(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
|
||||
int temp = oldLength;
|
||||
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
|
||||
|
@ -246,11 +259,11 @@ public class GenerateData implements UCD_Types {
|
|||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void partitionProperties() throws IOException {
|
||||
|
||||
|
||||
// find properties
|
||||
|
||||
|
||||
int count = 0;
|
||||
int[] props = new int[500];
|
||||
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
|
||||
|
@ -260,7 +273,7 @@ public class GenerateData implements UCD_Types {
|
|||
props[count++] = i;
|
||||
}
|
||||
System.out.println("props: " + count);
|
||||
|
||||
|
||||
BitSet probe = new BitSet();
|
||||
Map map = new HashMap();
|
||||
int total = 0;
|
||||
|
@ -269,12 +282,12 @@ public class GenerateData implements UCD_Types {
|
|||
int cat = ucd.getCategory(cp);
|
||||
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
|
||||
if (iProp) probe.set(i); else probe.clear(i);
|
||||
}
|
||||
|
||||
|
||||
++total;
|
||||
if (!map.containsKey(probe)) {
|
||||
map.put(probe.clone(), UTF32.valueOf32(cp));
|
||||
|
@ -282,27 +295,27 @@ public class GenerateData implements UCD_Types {
|
|||
System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + ucd.getCodeAndName(cp));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Set Size: " + map.size());
|
||||
}
|
||||
|
||||
|
||||
public static void listDifferences() throws IOException {
|
||||
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "PropertyDifferences.txt"));
|
||||
|
||||
|
||||
for (int i = 1; i < LIMIT_ENUM; ++i) {
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
|
||||
|
||||
|
||||
System.out.println();
|
||||
System.out.println();
|
||||
System.out.println(iNameLong);
|
||||
output.println("#" + iNameLong);
|
||||
|
||||
|
||||
int last = -1;
|
||||
for (int j = i+1; j < LIMIT_ENUM; ++j) {
|
||||
int jType = j & 0xFF00;
|
||||
|
@ -320,17 +333,17 @@ public class GenerateData implements UCD_Types {
|
|||
System.out.print('.');
|
||||
}
|
||||
System.out.flush();
|
||||
|
||||
|
||||
int bothCount = 0, i_jPropCount = 0, j_iPropCount = 0, iCount = 0, jCount = 0;
|
||||
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
int cat = ucd.getCategory(cp);
|
||||
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
|
||||
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
|
||||
|
||||
|
||||
if (jProp) ++jCount;
|
||||
if (iProp) {
|
||||
++iCount;
|
||||
|
@ -339,21 +352,21 @@ public class GenerateData implements UCD_Types {
|
|||
} else if (jProp) ++j_iPropCount;
|
||||
}
|
||||
if (iCount == 0 || jCount == 0) continue;
|
||||
|
||||
|
||||
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
|
||||
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
|
||||
|
||||
|
||||
String rel = bothCount == 0 ? "DISJOINT"
|
||||
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
|
||||
: i_jPropCount == 0 ? "CONTAINS" // depends on reverse output
|
||||
: j_iPropCount == 0 ? "CONTAINS"
|
||||
: "OVERLAPS";
|
||||
|
||||
|
||||
if (j_iPropCount > i_jPropCount) {
|
||||
// reverse output
|
||||
output.println(jNameShort + "\t" + iNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount);
|
||||
} else {
|
||||
} else {
|
||||
output.println(iNameShort + "\t" + jNameShort + "\t" + rel
|
||||
+ "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount);
|
||||
}
|
||||
|
@ -361,8 +374,8 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public static void listProperties() {
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
int type = i & 0xFF00;
|
||||
|
@ -373,7 +386,7 @@ public class GenerateData implements UCD_Types {
|
|||
else if (value.equals("<unused>")) continue;
|
||||
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
if (abbvalue.length() == 0) abbvalue = "no";
|
||||
|
||||
|
||||
if (type == COMBINING_CLASS) {
|
||||
value = MyPropertyLister.getCombiningName(i);
|
||||
if (value.length() == 0) {
|
||||
|
@ -382,32 +395,32 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
abbvalue = value;
|
||||
}
|
||||
|
||||
|
||||
String elide = "";
|
||||
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
|
||||
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
|
||||
+ abbvalue
|
||||
+ "}";
|
||||
String abb = "";
|
||||
if (type != BINARY_PROPERTIES) abb = "\\p{"
|
||||
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
|
||||
if (type != BINARY_PROPERTIES) abb = "\\p{"
|
||||
+ UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
|
||||
+ "="
|
||||
+ abbvalue
|
||||
+ "}";
|
||||
String norm = "";
|
||||
if (type != BINARY_PROPERTIES) norm = "\\p{"
|
||||
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
|
||||
if (type != BINARY_PROPERTIES) norm = "\\p{"
|
||||
+ UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
|
||||
+ "="
|
||||
+ value
|
||||
+ "}";
|
||||
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
|
||||
int headerChoice, String file) throws IOException {
|
||||
|
||||
|
||||
//System.out.println(ucd.toString(0x1E0A));
|
||||
/*
|
||||
System.out.println(ucd.getData(0xFFFF));
|
||||
|
@ -418,14 +431,14 @@ public class GenerateData implements UCD_Types {
|
|||
if (true) return;
|
||||
String test2 = ucd.getName(0x2A6D6);
|
||||
//*/
|
||||
|
||||
|
||||
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file + "dX.txt"));
|
||||
doHeader(file, output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (JOINING_TYPE | JT_U)
|
||||
|| i == (JOINING_GROUP | NO_SHAPING)
|
||||
|
@ -447,7 +460,7 @@ public class GenerateData implements UCD_Types {
|
|||
output.println();
|
||||
}
|
||||
System.out.print(".");
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
if (endEnum == LIMIT_ENUM) {
|
||||
output.println();
|
||||
|
@ -457,7 +470,7 @@ public class GenerateData implements UCD_Types {
|
|||
output.println();
|
||||
System.out.println();
|
||||
System.out.println("@NUMERIC VALUES");
|
||||
|
||||
|
||||
Set floatSet = new TreeSet();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
float nv = ucd.getNumericValue(i);
|
||||
|
@ -474,21 +487,21 @@ public class GenerateData implements UCD_Types {
|
|||
output.close();
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
static public Normalizer formC, formD, formKC, formKD;
|
||||
|
||||
|
||||
static public void writeNormalizerTestSuite(String fileName) throws IOException {
|
||||
ucd = UCD.make();
|
||||
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
|
||||
|
||||
formC = new Normalizer(Normalizer.NFC);
|
||||
formD = new Normalizer(Normalizer.NFD);
|
||||
formKC = new Normalizer(Normalizer.NFKC);
|
||||
formKD = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
|
||||
String[] example = new String[256];
|
||||
|
||||
log.println("# " + fixFile(fileName));
|
||||
|
@ -522,24 +535,24 @@ public class GenerateData implements UCD_Types {
|
|||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
|
||||
|
||||
System.out.println("Writing Part 1");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part0 # Specific cases");
|
||||
log.println("#");
|
||||
|
||||
|
||||
for (int j = 0; j < testSuiteCases.length; ++j) {
|
||||
writeLine(testSuiteCases[j], log, false);
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part1 # Character by character test");
|
||||
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
|
||||
log.println("#");
|
||||
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
|
@ -548,7 +561,7 @@ public class GenerateData implements UCD_Types {
|
|||
writeLine(cc,log, true);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
|
||||
System.out.println("Finding Examples");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
|
@ -558,24 +571,24 @@ public class GenerateData implements UCD_Types {
|
|||
int cc = ucd.getCombiningClass(ch);
|
||||
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
|
||||
}
|
||||
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part2 # Canonical Order Test");
|
||||
log.println("#");
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
|
||||
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
if (ucd.isPUA(ch)) continue;
|
||||
short c = ucd.getCombiningClass(ch);
|
||||
if (c == 0) continue;
|
||||
|
||||
|
||||
// add character with higher class, same class, lower class
|
||||
|
||||
|
||||
String sample = "";
|
||||
for (int i = c+1; i < example.length; ++i) {
|
||||
if (example[i] == null) continue;
|
||||
|
@ -588,7 +601,7 @@ public class GenerateData implements UCD_Types {
|
|||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
|
||||
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
|
||||
}
|
||||
|
@ -597,14 +610,14 @@ public class GenerateData implements UCD_Types {
|
|||
log.println("# END OF FILE");
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
static void writeLine(String cc, PrintWriter log, boolean check) {
|
||||
String c = formC.normalize(cc);
|
||||
String d = formD.normalize(cc);
|
||||
String kc = formKC.normalize(cc);
|
||||
String kd = formKD.normalize(cc);
|
||||
if (check & cc.equals(c) && cc.equals(d) && cc.equals(kc) && cc.equals(kd)) return;
|
||||
|
||||
|
||||
// consistency check
|
||||
String dc = formD.normalize(c);
|
||||
String dkc = formD.normalize(kc);
|
||||
|
@ -613,18 +626,18 @@ public class GenerateData implements UCD_Types {
|
|||
Normalizer.SHOW_PROGRESS = true;
|
||||
d = formD.normalize(cc);
|
||||
}
|
||||
|
||||
|
||||
// printout
|
||||
log.println(
|
||||
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
|
||||
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
|
||||
+ "; # ("
|
||||
+ "; # ("
|
||||
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
|
||||
+ ") " + ucd.getName(cc));
|
||||
}
|
||||
|
||||
|
||||
static StringBuffer commaResult = new StringBuffer();
|
||||
|
||||
|
||||
// not recursive!!!
|
||||
static final String comma(String s) {
|
||||
commaResult.setLength(0);
|
||||
|
@ -636,7 +649,7 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
return commaResult.toString();
|
||||
}
|
||||
|
||||
|
||||
static final String[] testSuiteCases = {
|
||||
"\u1E0A",
|
||||
"\u1E0C",
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.utility;
|
||||
|
||||
import java.io.*;
|
||||
|
|
|
@ -1,23 +1,36 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
class MyFloatLister extends PropertyLister {
|
||||
private float propMask;
|
||||
|
||||
|
||||
public MyFloatLister(UCD ucd, float f, PrintStream output) {
|
||||
this.propMask = f;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
}
|
||||
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return ""+ucdData.getNumericValue(cp);
|
||||
}
|
||||
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return ucdData.getNumericTypeID(cp);
|
||||
}
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
//if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp));
|
||||
if (!ucdData.isRepresented(cp)) {
|
||||
|
@ -28,4 +41,4 @@ class MyFloatLister extends PropertyLister {
|
|||
return ucdData.getNumericValue(cp) == propMask ? INCLUDE : EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,21 +1,34 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class MyPropertyLister extends PropertyLister {
|
||||
|
||||
|
||||
static final boolean BRIDGE = false;
|
||||
|
||||
|
||||
private int propMask;
|
||||
|
||||
|
||||
public MyPropertyLister(UCD ucd, int propMask, PrintStream output) {
|
||||
this.propMask = propMask;
|
||||
this.output = output;
|
||||
this.ucdData = ucd;
|
||||
if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat
|
||||
}
|
||||
|
||||
|
||||
static String getCombiningName (int propMask) {
|
||||
String s = "";
|
||||
switch (propMask & 0xFF) {
|
||||
|
@ -46,7 +59,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
public String headerString() {
|
||||
int main = (propMask & 0xFF00);
|
||||
if (main == COMBINING_CLASS) {
|
||||
|
@ -63,18 +76,18 @@ final class MyPropertyLister extends PropertyLister {
|
|||
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return getUnifiedBinaryPropertyID(propMask);
|
||||
}
|
||||
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (propMask < COMBINING_CLASS) return ""; // skip gen cat
|
||||
int cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
|
||||
return ucdData.getCategoryID(cp);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public String optionalName(int cp) {
|
||||
if ((propMask & 0xFF00) == DECOMPOSITION_TYPE) {
|
||||
|
@ -84,7 +97,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
//if (cp == 0xFFFF) {
|
||||
// System.out.println("# " + Utility.hex(cp));
|
||||
|
@ -93,7 +106,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
//if (cp == 0x0385) {
|
||||
// System.out.println(Utility.hex(firstRealCp));
|
||||
//}
|
||||
|
||||
|
||||
if (cat == Cn
|
||||
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
|
||||
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
|
||||
|
@ -101,7 +114,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
if (BRIDGE) return CONTINUE;
|
||||
else return EXCLUDE;
|
||||
}
|
||||
|
||||
|
||||
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
|
||||
/*
|
||||
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
|
||||
|
@ -119,7 +132,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
if (!inSet) return EXCLUDE;
|
||||
return INCLUDE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return unified property number
|
||||
*/
|
||||
|
@ -141,12 +154,12 @@ final class MyPropertyLister extends PropertyLister {
|
|||
case AGE>>8: return propMask < LIMIT_AGE;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
|
||||
return getUnifiedBinaryProperty(ucdData, cp, propMask);
|
||||
}
|
||||
|
||||
|
||||
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
|
@ -177,21 +190,21 @@ final class MyPropertyLister extends PropertyLister {
|
|||
return ucd.getAge(cp) == propMask;
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
|
||||
|
||||
|
||||
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
|
||||
return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
|
||||
}
|
||||
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
|
||||
if (longOne.equals(shortOne)) return longOne;
|
||||
return shortOne + "(" + longOne + ")";
|
||||
}
|
||||
|
||||
|
||||
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
String pre = "";
|
||||
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
|
||||
|
@ -205,7 +218,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
if (shortOne.length() == 0) shortOne = "xx";
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
if (longOne.length() == 0) longOne = "none";
|
||||
|
||||
|
||||
String post;
|
||||
if (style < LONG) post = shortOne;
|
||||
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
|
||||
|
@ -215,10 +228,10 @@ final class MyPropertyLister extends PropertyLister {
|
|||
pre = post + "=";
|
||||
post = "T";
|
||||
}
|
||||
|
||||
|
||||
return pre + post;
|
||||
}
|
||||
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
int enum = unifiedPropMask >> 8;
|
||||
byte propMask = (byte)unifiedPropMask;
|
||||
|
@ -264,7 +277,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
return ucd.getAgeID_fromIndex(propMask);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -16,13 +29,13 @@ import com.ibm.text.utility.*;
|
|||
* in connection with or arising out of the use of the information here.
|
||||
* @author Mark Davis
|
||||
*/
|
||||
|
||||
|
||||
public final class Normalizer implements UCD_Types {
|
||||
public static final String copyright =
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
|
||||
public static boolean SHOW_PROGRESS = false;
|
||||
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
|
@ -31,41 +44,41 @@ public final class Normalizer implements UCD_Types {
|
|||
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
|
||||
this.data = getData(unicodeVersion);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public Normalizer(byte form) {
|
||||
this(form,"");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Masks for the form selector
|
||||
*/
|
||||
public static final byte
|
||||
public static final byte
|
||||
COMPATIBILITY_MASK = 1,
|
||||
COMPOSITION_MASK = 2;
|
||||
|
||||
|
||||
/**
|
||||
* Normalization Form Selector
|
||||
*/
|
||||
public static final byte
|
||||
NFD = 0 ,
|
||||
public static final byte
|
||||
NFD = 0 ,
|
||||
NFKD = COMPATIBILITY_MASK,
|
||||
NFC = COMPOSITION_MASK,
|
||||
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
|
||||
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
* Normalizes text according to the chosen form,
|
||||
* replacing contents of the target buffer.
|
||||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
public StringBuffer normalize(String source, StringBuffer target) {
|
||||
|
||||
|
||||
// First decompose the source into target,
|
||||
// then compose if the form requires.
|
||||
|
||||
|
||||
if (source.length() != 0) {
|
||||
internalDecompose(source, target);
|
||||
if (composition) {
|
||||
|
@ -83,7 +96,7 @@ public final class Normalizer implements UCD_Types {
|
|||
public String normalize(String source) {
|
||||
return normalize(source, new StringBuffer()).toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
|
@ -92,18 +105,18 @@ public final class Normalizer implements UCD_Types {
|
|||
public String normalize(int cp) {
|
||||
return normalize(UTF16.valueOf(cp));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*/
|
||||
private StringBuffer hasDecompositionBuffer = new StringBuffer();
|
||||
|
||||
|
||||
public boolean hasDecomposition(int cp) {
|
||||
hasDecompositionBuffer.setLength(0);
|
||||
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
|
||||
if (hasDecompositionBuffer.length() != 1) return true;
|
||||
return cp != hasDecompositionBuffer.charAt(0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Does a quick check to see if the string is in the current form. Checks canonical order and
|
||||
* isAllowed().
|
||||
|
@ -112,7 +125,7 @@ public final class Normalizer implements UCD_Types {
|
|||
*/
|
||||
/*
|
||||
public static final int NO = 0, YES = 1, MAYBE = -1;
|
||||
|
||||
|
||||
public int quickCheck(String source) {
|
||||
short lastCanonicalClass = 0;
|
||||
int result = YES;
|
||||
|
@ -128,7 +141,7 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Find whether the given character is allowed in the current form.
|
||||
* @return YES, NO, MAYBE
|
||||
|
@ -153,7 +166,7 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
return YES;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility: Gets the combining class of a character from the
|
||||
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
|
||||
|
@ -161,13 +174,13 @@ public final class Normalizer implements UCD_Types {
|
|||
* @param ch the source character
|
||||
* @return value from 0 to 255
|
||||
*/
|
||||
|
||||
|
||||
public short getCanonicalClass(char ch) {
|
||||
return data.getCanonicalClass(ch);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Utility: Checks whether there is a recursive decomposition of a character from the
|
||||
* Unicode Character Database. It is compatibility or canonical according to the particular
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
|
@ -175,11 +188,11 @@ public final class Normalizer implements UCD_Types {
|
|||
public boolean normalizationDiffers(int ch) {
|
||||
return data.normalizationDiffers(ch, composition, compatibility);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility: Gets recursive decomposition of a character from the
|
||||
* Utility: Gets recursive decomposition of a character from the
|
||||
* Unicode Character Database.
|
||||
* @param compatibility If false selects the recursive
|
||||
* @param compatibility If false selects the recursive
|
||||
* canonical decomposition, otherwise selects
|
||||
* the recursive compatibility AND canonical decomposition.
|
||||
* @param ch the source character
|
||||
|
@ -188,7 +201,7 @@ public final class Normalizer implements UCD_Types {
|
|||
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
|
||||
data.getRecursiveDecomposition(ch, buffer, compatibility);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility: Gets composition mapping.
|
||||
* @return IntEnumeration with the pair -> value mapping, where the
|
||||
|
@ -199,18 +212,18 @@ public final class Normalizer implements UCD_Types {
|
|||
public IntHashtable.IntEnumeration getComposition() {
|
||||
return data.getComposition();
|
||||
}
|
||||
|
||||
|
||||
*/
|
||||
|
||||
|
||||
public boolean isTrailing(int cp) {
|
||||
return this.composition ? data.isTrailing(cp) : false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// ======================================
|
||||
// PRIVATES
|
||||
// ======================================
|
||||
|
||||
|
||||
/**
|
||||
* The current form.
|
||||
*/
|
||||
|
@ -221,7 +234,7 @@ public final class Normalizer implements UCD_Types {
|
|||
* Decomposes text, either canonical or compatibility,
|
||||
* replacing contents of the target buffer.
|
||||
* @param form the normalization form. If COMPATIBILITY_MASK
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* compatibility decomposition, otherwise selects
|
||||
* the recursive canonical decomposition.
|
||||
* @param source the original text, unnormalized
|
||||
|
@ -234,20 +247,20 @@ public final class Normalizer implements UCD_Types {
|
|||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compatibility);
|
||||
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
// no decomposition mapping)
|
||||
|
||||
|
||||
int ch;
|
||||
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16Plus.charAt(buffer, j);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int k = target.length(); // insertion point
|
||||
if (chClass != 0) {
|
||||
|
||||
|
||||
// bubble-sort combining marks as necessary
|
||||
|
||||
|
||||
int ch2;
|
||||
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
|
||||
ch2 = UTF16Plus.charAt(target, k-1);
|
||||
|
@ -273,9 +286,9 @@ public final class Normalizer implements UCD_Types {
|
|||
int lastClass = data.getCanonicalClass(starterCh);
|
||||
if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
|
||||
int oldLen = target.length();
|
||||
|
||||
|
||||
// Loop on the decomposed characters, combining where possible
|
||||
|
||||
|
||||
int ch;
|
||||
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
|
||||
ch = UTF16Plus.charAt(target, decompPos);
|
||||
|
@ -317,7 +330,7 @@ public final class Normalizer implements UCD_Types {
|
|||
private BitSet canonicalRecompose = new BitSet();
|
||||
private BitSet compatibilityRecompose = new BitSet();
|
||||
static final int NOT_COMPOSITE = 0xFFFF;
|
||||
|
||||
|
||||
Stub(String version) {
|
||||
ucd = UCD.make(version);
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
|
@ -336,23 +349,23 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
if (ucd.getCombiningClass(a) != 0) continue;
|
||||
|
||||
|
||||
int b = UTF16.charAt(s, UTF16.getCharCount(a));
|
||||
isSecond.set(b);
|
||||
|
||||
|
||||
// have a recomposition, so set the bit
|
||||
canonicalRecompose.set(i);
|
||||
|
||||
// set the compatibility recomposition bit
|
||||
|
||||
// set the compatibility recomposition bit
|
||||
// ONLY if the component characters
|
||||
// don't compatibility decompose
|
||||
if (ucd.getDecompositionType(a) <= CANONICAL
|
||||
&& ucd.getDecompositionType(b) <= CANONICAL) {
|
||||
compatibilityRecompose.set(i);
|
||||
}
|
||||
|
||||
|
||||
long key = (((long)a)<<32) | b;
|
||||
|
||||
|
||||
/*if (i == '\u1E0A' || key == 0x004400000307) {
|
||||
System.out.println(Utility.hex(s));
|
||||
System.out.println(Utility.hex(i));
|
||||
|
@ -392,15 +405,15 @@ Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
|
|||
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
|
||||
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
||||
*/
|
||||
|
||||
|
||||
short getCanonicalClass(int cp) {
|
||||
return ucd.getCombiningClass(cp);
|
||||
}
|
||||
|
||||
|
||||
boolean isTrailing(int cp) {
|
||||
return isSecond.get(cp);
|
||||
}
|
||||
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
|
@ -413,7 +426,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
else return dt == CANONICAL && !canonicalRecompose.get(cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
|
||||
|
@ -427,7 +440,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
UTF16.append(buffer, cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int getPairwiseComposition(int starterCh, int ch) {
|
||||
int hangulPoss = UCD.composeHangul(starterCh, ch);
|
||||
if (hangulPoss != 0xFFFF) return hangulPoss;
|
||||
|
@ -435,17 +448,17 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
if (obj == null) return 0xFFFF;
|
||||
return ((Integer)obj).intValue();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Contains normalization data from the Unicode Character Database.
|
||||
* use false for the minimal set, true for the real set.
|
||||
* use false for the minimal set, true for the real set.
|
||||
*/
|
||||
private Stub data;
|
||||
|
||||
|
||||
private static HashMap versionCache = new HashMap();
|
||||
|
||||
|
||||
private static Stub getData (String version) {
|
||||
if (version.length() == 0) version = UCD.latestVersion;
|
||||
Stub result = (Stub)versionCache.get(version);
|
||||
|
@ -455,7 +468,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
|
@ -463,7 +476,7 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
boolean isExcluded (char ch) {
|
||||
return data.isExcluded(ch);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
|
@ -5,11 +18,11 @@ import com.ibm.text.utility.*;
|
|||
|
||||
|
||||
abstract public class PropertyLister implements UCD_Types {
|
||||
|
||||
|
||||
static final boolean COMPRESS_NAMES = false;
|
||||
static final boolean DROP_INDICATORS = true;
|
||||
|
||||
|
||||
|
||||
|
||||
protected UCD ucdData;
|
||||
protected PrintStream output;
|
||||
protected boolean showOnConsole;
|
||||
|
@ -17,37 +30,37 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
protected int firstRealCp = -2;
|
||||
protected int lastRealCp = -2;
|
||||
protected boolean alwaysBreaks = false; // set to true if property only breaks
|
||||
|
||||
|
||||
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* @return status. Also have access to firstRealCp, lastRealCp
|
||||
*/
|
||||
abstract public byte status(int cp);
|
||||
|
||||
|
||||
public String headerString() {
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
public String optionalName(int cp) {
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (!usePropertyComment) return "";
|
||||
int cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
|
||||
return ucdData.getCategoryID(cp);
|
||||
}
|
||||
|
||||
|
||||
public int minPropertyWidth() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
public void format(int startCp, int endCp, int realCount) {
|
||||
try {
|
||||
String prop = propertyName(startCp);
|
||||
|
@ -65,7 +78,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
String count = (bridge == 0) ? "" + realCount : realCount + "/" + bridge;
|
||||
String countStr = Utility.repeat(" ", 3-count.length()) + "[" + count + "] ";
|
||||
String gap = Utility.repeat(" ", 12 - width(startCp) - width(endCp));
|
||||
|
||||
|
||||
line = Utility.hex(startCp,4) + ".." + Utility.hex(endCp,4) + gap
|
||||
+ prop + opt + pgap + " # " + optCom
|
||||
+ countStr;
|
||||
|
@ -75,7 +88,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
if (com == 0) {
|
||||
line += startName + ".." + endName;
|
||||
} else {
|
||||
line += startName.substring(0,com)
|
||||
line += startName.substring(0,com)
|
||||
+ "(" + startName.substring(com) + ".." + endName.substring(com) + ")";
|
||||
}
|
||||
}
|
||||
|
@ -93,17 +106,17 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
output.println(line);
|
||||
if (showOnConsole) System.out.println(line);
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Format error {0}, {1}",
|
||||
throw new ChainException("Format error {0}, {1}",
|
||||
new Object[]{new Integer(startCp), new Integer(endCp)}, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int width(int cp) {
|
||||
return cp <= 0xFFFF ? 4
|
||||
: cp <= 0xFFFFF ? 5
|
||||
return cp <= 0xFFFF ? 4
|
||||
: cp <= 0xFFFFF ? 5
|
||||
: 6;
|
||||
}
|
||||
|
||||
|
||||
String getKenName(int cp) {
|
||||
String result = ucdData.getName(cp);
|
||||
if (result == null) return "";
|
||||
|
@ -113,8 +126,8 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @return common initial substring length ending with SPACE or HYPHEN-MINUS. 0 if there is none
|
||||
*/
|
||||
|
@ -136,14 +149,14 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
}
|
||||
return lastSpace;
|
||||
}
|
||||
|
||||
|
||||
public int print() {
|
||||
int count = 0;
|
||||
firstRealCp = -1;
|
||||
byte firstRealCpCat = -1;
|
||||
lastRealCp = -1;
|
||||
int realRangeCount = 0;
|
||||
|
||||
|
||||
String header = headerString();
|
||||
if (header.length() != 0) {
|
||||
output.println(header);
|
||||
|
@ -156,7 +169,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
if (cat == Lt || cat == Ll) cat = Lu;
|
||||
if (cat != firstRealCpCat) s = BREAK;
|
||||
}
|
||||
|
||||
|
||||
switch(s) {
|
||||
case CONTINUE:
|
||||
break; // do nothing
|
||||
|
@ -177,7 +190,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
lastRealCp = firstRealCp = cp;
|
||||
firstRealCpCat = ucdData.getCategory(firstRealCp);
|
||||
if (firstRealCpCat == Lt || firstRealCpCat == Ll) firstRealCpCat = Lu;
|
||||
|
||||
|
||||
realRangeCount = 1;
|
||||
count++;
|
||||
break;
|
||||
|
@ -193,7 +206,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
if (firstRealCp != -1) {
|
||||
format(firstRealCp, lastRealCp, realRangeCount);
|
||||
}
|
||||
|
||||
|
||||
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
|
||||
output.println();
|
||||
output.println("# Total code points: " + count);
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -8,31 +21,31 @@ import java.text.SimpleDateFormat;
|
|||
import com.ibm.text.utility.*;
|
||||
|
||||
public class TestData implements UCD_Types {
|
||||
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
ucd = UCD.make();
|
||||
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
|
||||
|
||||
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
|
||||
int mask = 0;
|
||||
|
||||
|
||||
if (false) {
|
||||
|
||||
|
||||
generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedBidiClass-3.1.1d1.txt");
|
||||
|
||||
|
||||
|
||||
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.FC_NFKC_Closure, DerivedPropertyLister.ExpandsOnNFKC);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
mask = Utility.clearBit(mask, DerivedPropertyLister.FullCompInclusion);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedNormalizationProperties-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedEastAsianWidth-3.1.0d1.txt");
|
||||
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
|
||||
generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedGeneralCategory-3.1.0d1.txt");
|
||||
generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedCombiningClass-3.1.0d1.txt");
|
||||
|
@ -53,41 +66,41 @@ public class TestData implements UCD_Types {
|
|||
|
||||
mask = Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-3.1.0d1.txt");
|
||||
|
||||
|
||||
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
|
||||
"DerivedLineBreak-3.1.0d1.txt");
|
||||
|
||||
|
||||
generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, KEEP_SPECIAL, HEADER_SCRIPTS, "Scripts-3.1.0d4.txt");
|
||||
|
||||
|
||||
generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + Noncharacter_Code_Point + 1,
|
||||
KEEP_SPECIAL, HEADER_EXTEND, "PropList-3.1.0d5.txt");
|
||||
|
||||
|
||||
|
||||
|
||||
writeNormalizerTestSuite("NormalizationTest-3.1.0d1.txt");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
|
||||
|
||||
|
||||
//generateDerived(Utility.setBits(0, DerivedPropertyLister.PropMath, DerivedPropertyLister.Mod_ID_Continue_NO_Cf),
|
||||
// HEADER_DERIVED, "DerivedPropData2-3.1.0d1.txt");
|
||||
//generateVerticalSlice(SCRIPT, SCRIPT+1, KEEP_SPECIAL, "ScriptCommon-3.1.0d1.txt");
|
||||
//listStrings("LowerCase-3.1.0d1.txt", 0,0);
|
||||
//generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedPropData1-3.1.0d1.txt");
|
||||
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
|
||||
|
||||
|
||||
//generateCompExclusions();
|
||||
System.out.println("END");
|
||||
}
|
||||
|
||||
|
||||
static Normalizer nfkc = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
|
||||
public static void checkHoffman(String test) {
|
||||
String result = nfkc.normalize(test);
|
||||
System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
|
||||
|
@ -96,7 +109,7 @@ public class TestData implements UCD_Types {
|
|||
System.out.println();
|
||||
show(result, 0);
|
||||
}
|
||||
|
||||
|
||||
public static void show(String s, int indent) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
|
@ -110,16 +123,16 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
|
||||
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
}
|
||||
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
|
||||
public static String fixFile(String s) {
|
||||
int len = s.length();
|
||||
if (!s.endsWith(".txt")) return s;
|
||||
|
@ -129,9 +142,9 @@ public class TestData implements UCD_Types {
|
|||
System.out.println("Fixing File Name");
|
||||
return s.substring(0,len-6) + s.substring(len-4);
|
||||
}
|
||||
|
||||
|
||||
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
|
||||
|
||||
|
||||
public static void doHeader(String fileName, PrintStream output, int headerChoice) {
|
||||
output.println("# " + fixFile(fileName));
|
||||
output.println("#");
|
||||
|
@ -152,7 +165,7 @@ public class TestData implements UCD_Types {
|
|||
output.println("# ================================================");
|
||||
output.println();
|
||||
}
|
||||
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
|
||||
|
@ -167,13 +180,13 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public static void listStrings(String file, int type, int subtype) throws IOException {
|
||||
ucd = UCD.make("310");
|
||||
UCD ucd30 = UCD.make("300");
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) System.out.println("# " + i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
|
@ -188,17 +201,17 @@ public class TestData implements UCD_Types {
|
|||
output.close();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public static void generateCompExclusions() throws IOException {
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + "CompositionExclusionsDelta.txt"));
|
||||
new CompLister(output).print();
|
||||
output.close();
|
||||
}
|
||||
|
||||
|
||||
static class CompLister extends PropertyLister {
|
||||
UCD oldUCD;
|
||||
int oldLength = 0;
|
||||
|
||||
|
||||
public CompLister(PrintStream output) {
|
||||
this.output = output;
|
||||
ucdData = UCD.make("310");
|
||||
|
@ -209,7 +222,7 @@ public class TestData implements UCD_Types {
|
|||
return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
|
||||
}
|
||||
public byte status(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
if (ucdData.getDecompositionType(cp) == CANONICAL
|
||||
&& oldUCD.getDecompositionType(cp) != CANONICAL) {
|
||||
int temp = oldLength;
|
||||
oldLength = UTF32.length32(ucdData.getDecompositionMapping(cp));
|
||||
|
@ -219,11 +232,11 @@ public class TestData implements UCD_Types {
|
|||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial, int headerChoice, String file) throws IOException {
|
||||
|
||||
|
||||
//System.out.println(ucd.toString(0x1E0A));
|
||||
/*
|
||||
System.out.println(ucd.getData(0xFFFF));
|
||||
|
@ -234,14 +247,14 @@ public class TestData implements UCD_Types {
|
|||
if (true) return;
|
||||
String test2 = ucd.getName(0x2A6D6);
|
||||
//*/
|
||||
|
||||
|
||||
|
||||
|
||||
PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + file));
|
||||
doHeader(file, output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (CATEGORY | UNUSED_CATEGORY)
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (JOINING_TYPE | JT_U)
|
||||
|
@ -265,7 +278,7 @@ public class TestData implements UCD_Types {
|
|||
output.println();
|
||||
}
|
||||
System.out.print(".");
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
new MyPropertyLister(ucd, i, output).print();
|
||||
}
|
||||
if (endEnum == LIMIT_ENUM) {
|
||||
output.println();
|
||||
|
@ -275,7 +288,7 @@ public class TestData implements UCD_Types {
|
|||
output.println();
|
||||
System.out.println();
|
||||
System.out.println("@NUMERIC VALUES");
|
||||
|
||||
|
||||
Set floatSet = new TreeSet();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
float nv = ucd.getNumericValue(i);
|
||||
|
@ -292,13 +305,13 @@ public class TestData implements UCD_Types {
|
|||
output.close();
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
static public Normalizer formC, formD, formKC, formKD;
|
||||
|
||||
|
||||
static public void writeNormalizerTestSuite(String fileName) throws IOException {
|
||||
|
||||
|
||||
PrintWriter log = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
|
@ -309,7 +322,7 @@ public class TestData implements UCD_Types {
|
|||
formD = new Normalizer(Normalizer.NFD);
|
||||
formKC = new Normalizer(Normalizer.NFKC);
|
||||
formKD = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
|
||||
log.println("# " + fixFile(fileName));
|
||||
log.println("#");
|
||||
log.println("# Normalization Test Suite");
|
||||
|
@ -341,24 +354,24 @@ public class TestData implements UCD_Types {
|
|||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
|
||||
|
||||
|
||||
System.out.println("Writing Part 1");
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part0 # Specific cases");
|
||||
log.println("#");
|
||||
|
||||
|
||||
for (int j = 0; j < testSuiteCases.length; ++j) {
|
||||
writeLine(testSuiteCases[j], log, false);
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Writing Part 2");
|
||||
|
||||
|
||||
log.println("#");
|
||||
log.println("@Part1 # Character by character test");
|
||||
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
|
||||
log.println("#");
|
||||
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!ucd.isAssigned(ch)) continue;
|
||||
|
@ -367,7 +380,7 @@ public class TestData implements UCD_Types {
|
|||
writeLine(cc,log, true);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
|
||||
System.out.println("Finding Examples");
|
||||
|
||||
String[] example = new String[256];
|
||||
|
@ -379,7 +392,7 @@ public class TestData implements UCD_Types {
|
|||
int cc = ucd.getCombiningClass(ch);
|
||||
if (example[cc] == null) example[cc] = UTF32.valueOf32(ch);
|
||||
}
|
||||
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing Part 3");
|
||||
|
||||
|
@ -393,9 +406,9 @@ public class TestData implements UCD_Types {
|
|||
if (ucd.isPUA(ch)) continue;
|
||||
short c = ucd.getCombiningClass(ch);
|
||||
if (c == 0) continue;
|
||||
|
||||
|
||||
// add character with higher class, same class, lower class
|
||||
|
||||
|
||||
String sample = "";
|
||||
for (int i = c+1; i < example.length; ++i) {
|
||||
if (example[i] == null) continue;
|
||||
|
@ -408,7 +421,7 @@ public class TestData implements UCD_Types {
|
|||
sample += example[i];
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
|
||||
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
|
||||
}
|
||||
|
@ -417,7 +430,7 @@ public class TestData implements UCD_Types {
|
|||
log.println("# END OF FILE");
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
static void writeLine(String cc, PrintWriter log, boolean check) {
|
||||
String c = formC.normalize(cc);
|
||||
String d = formD.normalize(cc);
|
||||
|
@ -427,13 +440,13 @@ public class TestData implements UCD_Types {
|
|||
log.println(
|
||||
Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";"
|
||||
+ Utility.hex(kc," ") + ";" + Utility.hex(kd," ")
|
||||
+ "; # ("
|
||||
+ "; # ("
|
||||
+ comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; "
|
||||
+ ") " + ucd.getName(cc));
|
||||
}
|
||||
|
||||
|
||||
static StringBuffer commaResult = new StringBuffer();
|
||||
|
||||
|
||||
// not recursive!!!
|
||||
static final String comma(String s) {
|
||||
commaResult.setLength(0);
|
||||
|
@ -445,7 +458,7 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
return commaResult.toString();
|
||||
}
|
||||
|
||||
|
||||
static final String[] testSuiteCases = {
|
||||
"\u1E0A",
|
||||
"\u1E0C",
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -8,56 +21,56 @@ import com.ibm.text.utility.*;
|
|||
public final class TestNormalization {
|
||||
static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\";
|
||||
static final boolean SKIP_FILE = true;
|
||||
|
||||
|
||||
static PrintWriter out = null;
|
||||
static BufferedReader in = null;
|
||||
|
||||
|
||||
static Normalizer nfc;
|
||||
static Normalizer nfd;
|
||||
static Normalizer nfkc;
|
||||
static Normalizer nfkd;
|
||||
static UCD ucd;
|
||||
|
||||
|
||||
static BitSet charsListed = new BitSet(0x110000);
|
||||
static int errorCount = 0;
|
||||
static int lineErrorCount = 0;
|
||||
static String originalLine = "";
|
||||
static String lastLine = "";
|
||||
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
System.out.println("Creating Normalizers");
|
||||
ucd = UCD.make("");
|
||||
|
||||
|
||||
nfc = new Normalizer(Normalizer.NFC);
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkc = new Normalizer(Normalizer.NFKC);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
|
||||
String x = UTF32.valueOf32(0x10000);
|
||||
check("NFC", nfc, x);
|
||||
check("NFD", nfd, x);
|
||||
check("NFKC", nfkc, x);
|
||||
check("NFKD", nfkd, x);
|
||||
|
||||
|
||||
|
||||
|
||||
out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream("NormalizationTestLog.txt"),
|
||||
"UTF8"),
|
||||
32*1024));
|
||||
|
||||
|
||||
in = new BufferedReader (
|
||||
new FileReader (DIR + "NormalizationTest.txt"),
|
||||
32*1024);
|
||||
|
||||
|
||||
try {
|
||||
String[] parts = new String[10];
|
||||
|
||||
|
||||
System.out.println("Checking files");
|
||||
|
||||
|
||||
int count = 0;
|
||||
|
||||
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line);
|
||||
|
@ -69,69 +82,69 @@ public final class TestNormalization {
|
|||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
|
||||
|
||||
int splitCount = Utility.split(line, ';', parts);
|
||||
// FIX check splitCount
|
||||
for (int i = 0; i < splitCount; ++i) {
|
||||
parts[i] = Utility.fromHex(parts[i]);
|
||||
}
|
||||
|
||||
|
||||
if (UTF32.length32(parts[0]) == 1) {
|
||||
int code = UTF32.char32At(parts[0],0);
|
||||
charsListed.set(code);
|
||||
if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code));
|
||||
}
|
||||
|
||||
|
||||
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
errorCount += check("NFCa", nfc, parts[1], parts[0]);
|
||||
errorCount += check("NFCb", nfc, parts[1], parts[1]);
|
||||
errorCount += check("NFCc", nfc, parts[1], parts[2]);
|
||||
|
||||
errorCount += check("NFCa", nfc, parts[1], parts[0]);
|
||||
errorCount += check("NFCb", nfc, parts[1], parts[1]);
|
||||
errorCount += check("NFCc", nfc, parts[1], parts[2]);
|
||||
|
||||
// c4 == NFC(c4) == NFC(c5)
|
||||
errorCount += check("NFCd", nfc, parts[3], parts[3]);
|
||||
errorCount += check("NFCe", nfc, parts[3], parts[4]);
|
||||
errorCount += check("NFCd", nfc, parts[3], parts[3]);
|
||||
errorCount += check("NFCe", nfc, parts[3], parts[4]);
|
||||
|
||||
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
errorCount += check("NFDa", nfd, parts[2], parts[0]);
|
||||
errorCount += check("NFDb", nfd, parts[2], parts[1]);
|
||||
errorCount += check("NFDc", nfd, parts[2], parts[2]);
|
||||
|
||||
errorCount += check("NFDa", nfd, parts[2], parts[0]);
|
||||
errorCount += check("NFDb", nfd, parts[2], parts[1]);
|
||||
errorCount += check("NFDc", nfd, parts[2], parts[2]);
|
||||
|
||||
// c5 == NFD(c4) == NFD(c5)
|
||||
errorCount += check("NFDd", nfd, parts[4], parts[3]);
|
||||
errorCount += check("NFDe", nfd, parts[4], parts[4]);
|
||||
|
||||
errorCount += check("NFDd", nfd, parts[4], parts[3]);
|
||||
errorCount += check("NFDe", nfd, parts[4], parts[4]);
|
||||
|
||||
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
|
||||
|
||||
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
|
||||
|
||||
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
|
||||
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
|
||||
}
|
||||
System.out.println("Total errors in file: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
errorCount = lineErrorCount = 0;
|
||||
|
||||
|
||||
System.out.println("Checking Missing");
|
||||
checkMissing();
|
||||
System.out.println("Total errors in unlisted items: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
|
||||
|
||||
} finally {
|
||||
if (in != null) in.close();
|
||||
if (out != null) out.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static String lastBase = "";
|
||||
|
||||
|
||||
public static int check(String type, Normalizer n, String base, String other) {
|
||||
try {
|
||||
String trans = n.normalize(other);
|
||||
|
@ -149,8 +162,8 @@ public final class TestNormalization {
|
|||
if (!base.equals(other)) {
|
||||
otherList = "(" + ucd.getCodeAndName(other) + ")";
|
||||
}
|
||||
out.println("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
out.println("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ type
|
||||
+ otherList
|
||||
+ " == " + ucd.getCodeAndName(trans)
|
||||
|
@ -159,17 +172,17 @@ public final class TestNormalization {
|
|||
return 1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
throw new ChainException("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
public static int check(String type, Normalizer n, String base) {
|
||||
return check(type, n, base, base);
|
||||
}
|
||||
|
||||
|
||||
static void checkMissing() {
|
||||
for (int missing = 0; missing < 0x100000; ++missing) {
|
||||
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
|
||||
|
@ -180,6 +193,6 @@ public final class TestNormalization {
|
|||
errorCount += check("NFKC", nfkc, x);
|
||||
errorCount += check("NFKD", nfkd, x);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
@ -16,14 +29,14 @@ public final class UCD implements UCD_Types {
|
|||
* Used for the default version.
|
||||
*/
|
||||
public static final String latestVersion = "3.1.1";
|
||||
|
||||
|
||||
/**
|
||||
* Create singleton instance for default (latest) version
|
||||
*/
|
||||
public static UCD make() {
|
||||
return make("");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create singleton instance for the specific version
|
||||
*/
|
||||
|
@ -37,21 +50,21 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the version of the UCD
|
||||
*/
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the date that the data was parsed
|
||||
*/
|
||||
public long getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Is the code point allocated?
|
||||
*/
|
||||
|
@ -64,14 +77,14 @@ public final class UCD implements UCD_Types {
|
|||
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Is the code point assigned to a character (or surrogate)
|
||||
*/
|
||||
public boolean isAssigned(int codePoint) {
|
||||
return getCategory(codePoint) != Cn;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Is the code point a PUA character (fast check)
|
||||
*/
|
||||
|
@ -80,7 +93,7 @@ public final class UCD implements UCD_Types {
|
|||
|| codePoint >= 0xF0000 && codePoint < 0xFFFFE
|
||||
|| codePoint >= 0x100000 && codePoint < 0x10FFFE);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Many ranges are elided in the UCD. All but the first are not actually
|
||||
* represented in the data internally. This detects such cases.
|
||||
|
@ -88,21 +101,21 @@ public final class UCD implements UCD_Types {
|
|||
public boolean isRepresented(int codePoint) {
|
||||
return getRaw(codePoint) != null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return XML version of the data associated with the code point.
|
||||
*/
|
||||
public String toString(int codePoint) {
|
||||
return get(codePoint, true).toString(FULL);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character name.
|
||||
*/
|
||||
public String getName(int codePoint) {
|
||||
return get(codePoint, true).name;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
|
@ -117,14 +130,14 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the code in U+ notation
|
||||
*/
|
||||
public static String getCode(int codePoint) {
|
||||
return "U+" + Utility.hex(codePoint);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the code in U+ notation
|
||||
*/
|
||||
|
@ -139,14 +152,14 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for a code point
|
||||
*/
|
||||
public String getCodeAndName(int codePoint) {
|
||||
return getCode(codePoint) + " " + getName(codePoint);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for the code points in a string,
|
||||
* separated by ", "
|
||||
|
@ -163,14 +176,14 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the general category
|
||||
*/
|
||||
public byte getCategory(int codePoint) {
|
||||
return get(codePoint, false).generalCategory;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the main category, as a mask
|
||||
*/
|
||||
|
@ -187,7 +200,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
throw new IllegalArgumentException ("Illegal General Category " + cat);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the combining class, a number between zero and 255. Returned
|
||||
* as a short to avoid the signed-byte problem in Java
|
||||
|
@ -195,46 +208,46 @@ public final class UCD implements UCD_Types {
|
|||
public short getCombiningClass(int codePoint) {
|
||||
return (short)(get(codePoint, false).combiningClass & 0xFF);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Does this combining class actually occur in this version of the data.
|
||||
*/
|
||||
public boolean isCombiningClassUsed(byte value) {
|
||||
return combiningClassSet.get(0xFF & value);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the bidi class
|
||||
*/
|
||||
public byte getBidiClass(int codePoint) {
|
||||
return get(codePoint, false).bidiClass;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the RAW decomposition mapping. Must be used recursively for the full mapping!
|
||||
*/
|
||||
public String getDecompositionMapping(int codePoint) {
|
||||
return get(codePoint, true).decompositionMapping;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get BIDI mirroring character, if there is one.
|
||||
*/
|
||||
public String getBidiMirror(int codePoint) {
|
||||
return get(codePoint, true).bidiMirror;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the RAW decomposition type: the <...> field in the UCD data.
|
||||
*/
|
||||
public byte getDecompositionType(int codePoint) {
|
||||
return get(codePoint, false).decompositionType;
|
||||
}
|
||||
|
||||
|
||||
public float getNumericValue(int codePoint) {
|
||||
return get(codePoint, false).numericValue;
|
||||
}
|
||||
|
||||
|
||||
public byte getNumericType(int codePoint) {
|
||||
return get(codePoint, false).numericType;
|
||||
}
|
||||
|
@ -242,11 +255,11 @@ public final class UCD implements UCD_Types {
|
|||
public String getCase(int codePoint, byte simpleVsFull, byte caseType) {
|
||||
return getCase(codePoint, simpleVsFull, caseType, "");
|
||||
}
|
||||
|
||||
|
||||
public String getCase(String s, byte simpleVsFull, byte caseType) {
|
||||
return getCase(s, simpleVsFull, caseType, "");
|
||||
}
|
||||
|
||||
|
||||
public String getCase(int codePoint, byte simpleVsFull, byte caseType, String condition) {
|
||||
UData udata = get(codePoint, true);
|
||||
if (caseType < LOWER || caseType > FOLD
|
||||
|
@ -255,7 +268,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
if (caseType < FOLD) {
|
||||
if (simpleVsFull == FULL && udata.specialCasing.length() != 0) {
|
||||
if (condition.length() == 0
|
||||
if (condition.length() == 0
|
||||
|| udata.specialCasing.indexOf(condition) < 0) {
|
||||
simpleVsFull = SIMPLE;
|
||||
}
|
||||
|
@ -268,7 +281,7 @@ public final class UCD implements UCD_Types {
|
|||
else simpleVsFull = FULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
switch (caseType + simpleVsFull) {
|
||||
case SIMPLE + UPPER: return udata.simpleUppercase;
|
||||
case SIMPLE + LOWER: return udata.simpleLowercase;
|
||||
|
@ -281,7 +294,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
|
||||
}
|
||||
|
||||
|
||||
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
|
||||
if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType);
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -291,7 +304,7 @@ public final class UCD implements UCD_Types {
|
|||
cp = UTF32.char32At(s, i);
|
||||
String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition);
|
||||
result.append(mappedVersion);
|
||||
if (caseType == TITLE) {
|
||||
if (caseType == TITLE) {
|
||||
// if letter is cased, change to lowercase, otherwise change to TITLE
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Mn || cat == Me || cat == Mc) {
|
||||
|
@ -307,60 +320,60 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public String getSimpleLowercase(int codePoint) {
|
||||
return get(codePoint, true).simpleLowercase;
|
||||
}
|
||||
|
||||
|
||||
public String getSimpleUppercase(int codePoint) {
|
||||
return get(codePoint, true).simpleUppercase;
|
||||
}
|
||||
|
||||
|
||||
public String getSimpleTitlecase(int codePoint) {
|
||||
return get(codePoint, true).simpleTitlecase;
|
||||
}
|
||||
|
||||
|
||||
public String getSimpleCaseFolding(int codePoint) {
|
||||
return get(codePoint, true).simpleCaseFolding;
|
||||
}
|
||||
|
||||
|
||||
public String getFullLowercase(int codePoint) {
|
||||
return get(codePoint, true).fullLowercase;
|
||||
}
|
||||
|
||||
|
||||
public String getFullUppercase(int codePoint) {
|
||||
return get(codePoint, true).fullUppercase;
|
||||
}
|
||||
|
||||
|
||||
public String getFullTitlecase(int codePoint) {
|
||||
return get(codePoint, true).fullTitlecase;
|
||||
}
|
||||
|
||||
|
||||
public String getFullCaseFolding(int codePoint) {
|
||||
return get(codePoint, true).simpleCaseFolding;
|
||||
}
|
||||
|
||||
|
||||
public String getLowercase(int codePoint, boolean full) {
|
||||
if (full) return getFullLowercase(codePoint);
|
||||
return getSimpleLowercase(codePoint);
|
||||
}
|
||||
|
||||
|
||||
public String getUppercase(int codePoint, boolean full) {
|
||||
if (full) return getFullUppercase(codePoint);
|
||||
return getSimpleLowercase(codePoint);
|
||||
}
|
||||
|
||||
|
||||
public String getTitlecase(int codePoint, boolean full) {
|
||||
if (full) return getFullTitlecase(codePoint);
|
||||
return getSimpleTitlecase(codePoint);
|
||||
}
|
||||
|
||||
|
||||
public String getCaseFolding(int codePoint, boolean full) {
|
||||
if (full) return getFullCaseFolding(codePoint);
|
||||
return getSimpleCaseFolding(codePoint);
|
||||
}
|
||||
|
||||
|
||||
public String getLowercase(String s, boolean full) {
|
||||
if (s.length() == 1) return getLowercase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -372,7 +385,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public String getUppercase(String s, boolean full) {
|
||||
if (s.length() == 1) return getUppercase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -384,7 +397,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public String getTitlecase(String s, boolean full) {
|
||||
if (s.length() == 1) return getTitlecase(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -396,7 +409,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public String getCaseFolding(String s, boolean full) {
|
||||
if (s.length() == 1) return getCaseFolding(s.charAt(0), true);
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -409,184 +422,184 @@ public final class UCD implements UCD_Types {
|
|||
return result.toString();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
public String getSpecialCase(int codePoint) {
|
||||
return get(codePoint, true).specialCasing;
|
||||
}
|
||||
|
||||
|
||||
public byte getEastAsianWidth(int codePoint) {
|
||||
return get(codePoint, false).eastAsianWidth;
|
||||
}
|
||||
|
||||
|
||||
public byte getLineBreak(int codePoint) {
|
||||
return get(codePoint, false).lineBreak;
|
||||
}
|
||||
|
||||
|
||||
public byte getScript(int codePoint) {
|
||||
return get(codePoint, false).script;
|
||||
}
|
||||
|
||||
|
||||
public byte getAge(int codePoint) {
|
||||
return get(codePoint, false).age;
|
||||
}
|
||||
|
||||
|
||||
public byte getJoiningType(int codePoint) {
|
||||
return get(codePoint, false).joiningType;
|
||||
}
|
||||
|
||||
|
||||
public byte getJoiningGroup(int codePoint) {
|
||||
return get(codePoint, false).joiningGroup;
|
||||
}
|
||||
|
||||
|
||||
public int getBinaryProperties(int codePoint) {
|
||||
return get(codePoint, false).binaryProperties;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public boolean getBinaryProperty(int codePoint, int bit) {
|
||||
return (get(codePoint, false).binaryProperties & (1<<bit)) != 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ENUM Mask Utilties
|
||||
|
||||
|
||||
public int getCategoryMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).generalCategory;
|
||||
}
|
||||
|
||||
|
||||
public int getBidiClassMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).bidiClass;
|
||||
}
|
||||
|
||||
|
||||
public int getNumericTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).numericType;
|
||||
}
|
||||
|
||||
|
||||
public int getDecompositionTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).decompositionType;
|
||||
}
|
||||
|
||||
|
||||
public int getEastAsianWidthMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).eastAsianWidth;
|
||||
}
|
||||
|
||||
|
||||
public int getLineBreakMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).lineBreak;
|
||||
}
|
||||
|
||||
|
||||
public int getScriptMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).script;
|
||||
}
|
||||
|
||||
|
||||
public int getAgeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).age;
|
||||
}
|
||||
|
||||
|
||||
public int getJoiningTypeMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).joiningType;
|
||||
}
|
||||
|
||||
|
||||
public int getJoiningGroupMask(int codePoint) {
|
||||
return 1<<get(codePoint, false).joiningGroup;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// VERSIONS WITH NAMES
|
||||
|
||||
|
||||
public String getCategoryID(int codePoint) {
|
||||
return getCategoryID_fromIndex(getCategory(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getCategoryID_fromIndex(byte prop) {
|
||||
return UCD_Names.GC[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getBidiClassID(int codePoint) {
|
||||
return getBidiClassID_fromIndex(getBidiClass(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getBidiClassID_fromIndex(byte prop) {
|
||||
return UCD_Names.BC[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getCombiningClassID(int codePoint) {
|
||||
return getCombiningClassID_fromIndex(getCombiningClass(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getCombiningClassID_fromIndex(short cc) {
|
||||
return cc + "";
|
||||
}
|
||||
|
||||
|
||||
public String getDecompositionTypeID(int codePoint) {
|
||||
return getDecompositionTypeID_fromIndex(getDecompositionType(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getDecompositionTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.DT[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getNumericTypeID(int codePoint) {
|
||||
return getNumericTypeID_fromIndex(getNumericType(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getNumericTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.NT[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getEastAsianWidthID(int codePoint) {
|
||||
return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getEastAsianWidthID_fromIndex(byte prop) {
|
||||
return UCD_Names.EA[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getLineBreakID(int codePoint) {
|
||||
return getLineBreakID_fromIndex(getLineBreak(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getLineBreakID_fromIndex(byte prop) {
|
||||
return UCD_Names.LB[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getJoiningTypeID(int codePoint) {
|
||||
return getJoiningTypeID_fromIndex(getJoiningType(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getJoiningTypeID_fromIndex(byte prop) {
|
||||
return UCD_Names.JOINING_TYPE[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getJoiningGroupID(int codePoint) {
|
||||
return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getJoiningGroupID_fromIndex(byte prop) {
|
||||
return UCD_Names.JOINING_GROUP[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getScriptID(int codePoint) {
|
||||
return getScriptID_fromIndex(getScript(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getScriptID_fromIndex(byte prop) {
|
||||
return UCD_Names.SCRIPT[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getAgeID(int codePoint) {
|
||||
return getAgeID_fromIndex(getAge(codePoint));
|
||||
}
|
||||
|
||||
|
||||
public static String getAgeID_fromIndex(byte prop) {
|
||||
return UCD_Names.AGE[prop];
|
||||
}
|
||||
|
||||
|
||||
public String getBinaryPropertiesID(int codePoint, byte bit) {
|
||||
return (getBinaryProperties(codePoint) & (1<<bit)) != 0 ? "Y" : "N";
|
||||
}
|
||||
|
||||
|
||||
public static String getBinaryPropertiesID_fromIndex(byte bit) {
|
||||
return UCD_Names.BP[bit];
|
||||
}
|
||||
|
||||
|
||||
public static int mapToRepresentative(int ch, boolean old) {
|
||||
if (ch <= 0xFFFD) {
|
||||
//if (ch <= 0x2800) return ch;
|
||||
|
@ -624,7 +637,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
|
||||
public boolean isIdentifierStart(int cp, boolean extended) {
|
||||
if (extended) {
|
||||
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
|
||||
|
@ -635,7 +648,7 @@ public final class UCD implements UCD_Types {
|
|||
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
|
||||
if (isIdentifierStart(cp, extended)) return true;
|
||||
if (extended) {
|
||||
|
@ -646,7 +659,7 @@ public final class UCD implements UCD_Types {
|
|||
if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean isIdentifier(String s, boolean extended) {
|
||||
if (s.length() == 0) return false; // at least one!
|
||||
int cp;
|
||||
|
@ -661,34 +674,34 @@ public final class UCD implements UCD_Types {
|
|||
return true;
|
||||
}
|
||||
/*
|
||||
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
|
||||
Middle Dot. Because most Catalan legacy data will be encoded in Latin-1, U+00B7 MIDDLE DOT needs to be
|
||||
allowed in <identifier_extend>.
|
||||
|
||||
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
|
||||
0E33 THAI CHARACTER SARA AM
|
||||
0EB3 LAO VOWEL SIGN AM
|
||||
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
In particular, the following four characters should be in <identifier_extend> and not <identifier_start>:
|
||||
0E33 THAI CHARACTER SARA AM
|
||||
0EB3 LAO VOWEL SIGN AM
|
||||
FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
Irregularly decomposing characters. U+037A GREEK YPOGEGRAMMENI and certain Arabic presentation
|
||||
forms have irregular compatibility decompositions, and need to be excluded from both <identifier_start>
|
||||
and <identifier_extend>. It is recommended that all Arabic presentation forms be excluded from identifiers
|
||||
in any event, although only a few of them are required to be excluded for normalization
|
||||
to guarantee identifier closure.
|
||||
to guarantee identifier closure.
|
||||
*/
|
||||
|
||||
// *******************
|
||||
|
||||
// *******************
|
||||
// PRIVATES
|
||||
// *******************
|
||||
|
||||
// *******************
|
||||
|
||||
// cache of singletons
|
||||
private static Map versionCache = new HashMap();
|
||||
|
||||
|
||||
private static final int LIMIT_CODE_POINT = 0x110000;
|
||||
private static final UData[] ALL_NULLS = new UData[1024];
|
||||
|
||||
|
||||
// main data
|
||||
private UData[][] data = new UData[LIMIT_CODE_POINT>>10][];
|
||||
|
||||
|
||||
// extras
|
||||
private BitSet combiningClassSet = new BitSet(256);
|
||||
private String version;
|
||||
|
@ -699,19 +712,19 @@ to guarantee identifier closure.
|
|||
private byte minor = -1;
|
||||
private byte update = -1;
|
||||
private int size = -1;
|
||||
|
||||
|
||||
// cache last UData
|
||||
private int lastCode = Integer.MIN_VALUE;
|
||||
private UData lastResult = UData.UNASSIGNED;
|
||||
private boolean lastCodeFixed = false;
|
||||
|
||||
|
||||
// hide constructor
|
||||
private UCD() {
|
||||
for (int i = 0; i < data.length; ++i) {
|
||||
data[i] = ALL_NULLS;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void add(UData uData) {
|
||||
int high = uData.codePoint>>10;
|
||||
if (data[high] == ALL_NULLS) {
|
||||
|
@ -720,7 +733,7 @@ to guarantee identifier closure.
|
|||
}
|
||||
data[high][uData.codePoint & 0x3FF] = uData;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasComputableName(int codePoint) {
|
||||
if (codePoint >= 0xF900 && codePoint <= 0xFA2D) return true;
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
|
@ -744,11 +757,11 @@ to guarantee identifier closure.
|
|||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private UData getRaw(int codePoint) {
|
||||
return data[codePoint>>10][codePoint & 0x3FF];
|
||||
}
|
||||
|
||||
|
||||
// access data for codepoint
|
||||
UData get(int codePoint, boolean fixStrings) {
|
||||
//if (codePoint == lastCode && fixStrings <= lastCodeFixed) return lastResult;
|
||||
|
@ -756,7 +769,7 @@ to guarantee identifier closure.
|
|||
// we play some funny tricks for performance
|
||||
// if cp is not represented, it is either in a elided block or missing.
|
||||
// elided blocks are either CONTINUE or FFFF
|
||||
|
||||
|
||||
byte cat;
|
||||
if (!ucdData.isRepresented(cp)) {
|
||||
int rep = UCD.mapToRepresentative(cp);
|
||||
|
@ -768,9 +781,9 @@ to guarantee identifier closure.
|
|||
cat = ucdData.getCategory(cp);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
UData result = null;
|
||||
|
||||
|
||||
// do range stuff
|
||||
String constructedName = null;
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
|
@ -820,7 +833,7 @@ to guarantee identifier closure.
|
|||
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
result.codePoint = codePoint;
|
||||
if (fixStrings) {
|
||||
result.name = constructedName;
|
||||
|
@ -835,10 +848,10 @@ to guarantee identifier closure.
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Hangul constants
|
||||
|
||||
static final int
|
||||
|
||||
static final int
|
||||
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
|
||||
LCount = 19, VCount = 21, TCount = 28,
|
||||
NCount = VCount * TCount, // 588
|
||||
|
@ -859,9 +872,9 @@ to guarantee identifier closure.
|
|||
// if (true) return "?";
|
||||
return UCD_Names.JAMO_L_TABLE[LIndex] + UCD_Names.JAMO_V_TABLE[VIndex] + UCD_Names.JAMO_T_TABLE[TIndex];
|
||||
}
|
||||
|
||||
|
||||
private static final char[] pair = new char[2];
|
||||
|
||||
|
||||
static String getHangulDecompositionPair(int ch) {
|
||||
int SIndex = ch - SBase;
|
||||
if (0 > SIndex || SIndex >= SCount) {
|
||||
|
@ -877,7 +890,7 @@ to guarantee identifier closure.
|
|||
}
|
||||
return String.valueOf(pair);
|
||||
}
|
||||
|
||||
|
||||
static int composeHangul(int char1, int char2) {
|
||||
if (LBase <= char1 && char1 < LLimit && VBase <= char2 && char2 < VLimit) {
|
||||
return (SBase + ((char1 - LBase) * VCount + (char2 - VBase)) * TCount);
|
||||
|
@ -888,11 +901,11 @@ to guarantee identifier closure.
|
|||
}
|
||||
return 0xFFFF; // no composition
|
||||
}
|
||||
|
||||
|
||||
static boolean isTrailingJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
DataInputStream dataIn = null;
|
||||
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
|
||||
|
@ -909,58 +922,58 @@ to guarantee identifier closure.
|
|||
update = dataIn.readByte();
|
||||
String foundVersion = major + "." + minor + "." + update;
|
||||
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
|
||||
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
|
||||
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
|
||||
new Object[]{version, new Byte(format), foundVersion});
|
||||
}
|
||||
date = dataIn.readLong();
|
||||
size = uDataFileCount = dataIn.readInt();
|
||||
|
||||
|
||||
boolean didJoiningHack = false;
|
||||
|
||||
|
||||
|
||||
// records
|
||||
for (int i = 0; i < uDataFileCount; ++i) {
|
||||
UData uData = new UData();
|
||||
uData.readBytes(dataIn);
|
||||
|
||||
|
||||
if (uData.codePoint == 0x2801) {
|
||||
System.out.println("SPOT-CHECK: " + uData);
|
||||
}
|
||||
|
||||
|
||||
//T = Mc + (Cf - ZWNJ - ZWJ)
|
||||
int cp = uData.codePoint;
|
||||
byte old = uData.joiningType;
|
||||
byte cat = uData.generalCategory;
|
||||
//if (cp == 0x200D) {
|
||||
// uData.joiningType = JT_C;
|
||||
//} else
|
||||
//} else
|
||||
if (cp != 0x200D && cp != 0x200C && (cat == Mn || cat == Cf)) {
|
||||
uData.joiningType = JT_T;
|
||||
}
|
||||
if (!didJoiningHack && uData.joiningType != old) {
|
||||
System.out.println("HACK: Setting "
|
||||
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
|
||||
System.out.println("HACK: Setting "
|
||||
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
|
||||
+ ": " + Utility.hex(cp) + " " + uData.name);
|
||||
didJoiningHack = true;
|
||||
}
|
||||
|
||||
|
||||
combiningClassSet.set(uData.combiningClass & 0xFF);
|
||||
add(uData);
|
||||
}
|
||||
/*
|
||||
if (update == -1) {
|
||||
throw new ChainException("Data File truncated for ",
|
||||
throw new ChainException("Data File truncated for ",
|
||||
new Object[]{version}, e);
|
||||
}
|
||||
if (size != fileSize) {
|
||||
throw new ChainException("Counts do not match: file {0}, records {1}",
|
||||
throw new ChainException("Counts do not match: file {0}, records {1}",
|
||||
new Object[]{new Integer(fileSize), new Integer(size)});
|
||||
}
|
||||
*/
|
||||
// everything is ok!
|
||||
this.version = version;
|
||||
this.file = fileName;
|
||||
//+ " " + new File(fileName).lastModified();
|
||||
//+ " " + new File(fileName).lastModified();
|
||||
} catch (IOException e) {
|
||||
throw new ChainException("Can't read data file for {0}", new Object[]{version}, e);
|
||||
} finally {
|
||||
|
@ -971,4 +984,4 @@ to guarantee identifier closure.
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,16 +1,29 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
final class UCD_Names implements UCD_Types {
|
||||
|
||||
|
||||
static final String[] UNIFIED_PROPERTIES = {
|
||||
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
|
||||
"Combining Class (listing UnicodeData.txt, field 3: see UnicodeData.html)",
|
||||
"Bidi Class (listing UnicodeData.txt, field 4: see UnicodeData.html)",
|
||||
"Decomposition Type (from UnicodeData.txt, field 5: see UnicodeData.html)",
|
||||
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
|
||||
"Numeric Type (from UnicodeData.txt, field 6/7/8: see UnicodeData.html)",
|
||||
"East Asian Width (listing EastAsianWidth.txt, field 1)",
|
||||
"Line Break (listing LineBreak.txt, field 1)",
|
||||
"Joining Type (listing ArabicShaping.txt, field 1).\r\n"
|
||||
|
@ -21,13 +34,13 @@ final class UCD_Names implements UCD_Types {
|
|||
"Script",
|
||||
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
|
||||
};
|
||||
|
||||
|
||||
static final String[] SHORT_UNIFIED_PROPERTIES = {
|
||||
"GeneralCategory",
|
||||
"CombiningClass",
|
||||
"BidiClass",
|
||||
"DecompositionType",
|
||||
"NumericType",
|
||||
"NumericType",
|
||||
"EastAsianWidth",
|
||||
"LineBreak",
|
||||
"JoiningType",
|
||||
|
@ -36,13 +49,13 @@ final class UCD_Names implements UCD_Types {
|
|||
"Script",
|
||||
"Age"
|
||||
};
|
||||
|
||||
|
||||
static final String[] ABB_UNIFIED_PROPERTIES = {
|
||||
"gc",
|
||||
"cc",
|
||||
"bc",
|
||||
"dt",
|
||||
"nt",
|
||||
"nt",
|
||||
"ea",
|
||||
"lb",
|
||||
"jt",
|
||||
|
@ -51,11 +64,11 @@ final class UCD_Names implements UCD_Types {
|
|||
"sc",
|
||||
"Ag"
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
static final String[] BP = {
|
||||
"BidiMirrored",
|
||||
"CompositionExclusion",
|
||||
"CompositionExclusion",
|
||||
"White_Space",
|
||||
"NonBreak",
|
||||
"Bidi_Control",
|
||||
|
@ -84,10 +97,10 @@ final class UCD_Names implements UCD_Types {
|
|||
"Reserved_Cf_Code_Point",
|
||||
"Deprecated",
|
||||
};
|
||||
|
||||
|
||||
static final String[] SHORT_BP = {
|
||||
"BidiM",
|
||||
"CExc",
|
||||
"CExc",
|
||||
"WhSp",
|
||||
"NBrk",
|
||||
"BdCon",
|
||||
|
@ -116,11 +129,11 @@ final class UCD_Names implements UCD_Types {
|
|||
"RCf",
|
||||
"Dep",
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
static final String[] BP_OLD = {
|
||||
"BidiMirrored",
|
||||
"CompositionExclusion",
|
||||
"CompositionExclusion",
|
||||
"White_space",
|
||||
"Non_break",
|
||||
"Bidi_Control",
|
||||
|
@ -146,7 +159,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"UnifiedIdeograph"
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
static final String[] DeletedProperties = {
|
||||
"Private_Use",
|
||||
"Composite",
|
||||
|
@ -158,17 +171,17 @@ final class UCD_Names implements UCD_Types {
|
|||
"Private_Use_High_Surrogate",
|
||||
"Unassigned_Code_Point"
|
||||
};
|
||||
|
||||
|
||||
static final String[] YN_TABLE = {"N", "Y"};
|
||||
|
||||
|
||||
static String[] EA = {
|
||||
"N", "A", "H", "W", "F", "Na"
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
static String[] SHORT_EA = {
|
||||
"Neutral", "Ambiguous", "Halfwidth", "Wide", "Fullwidth", "Narrow"
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
static final String[] LB = {
|
||||
"XX", "OP", "CL", "QU", "GL", "NS", "EX", "SY",
|
||||
"IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY",
|
||||
|
@ -177,11 +190,11 @@ final class UCD_Names implements UCD_Types {
|
|||
};
|
||||
|
||||
static final String[] LONG_LB = {
|
||||
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
|
||||
"Unknown", "OpenPunctuation", "ClosePunctuation", "Quotation",
|
||||
"Glue", "Nonstarter", "Exclamation", "BreakSymbols",
|
||||
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
|
||||
"InfixNumeric", "PrefixNumeric", "PostfixNumeric",
|
||||
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
|
||||
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
|
||||
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
|
||||
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
|
||||
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
|
||||
};
|
||||
|
@ -230,7 +243,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"DESERET",
|
||||
"INHERITED",
|
||||
};
|
||||
|
||||
|
||||
public static final String[] ABB_SCRIPT = {
|
||||
"Zyyy", // COMMON -- NOT A LETTER: NO EXACT CORRESPONDENCE IN 15924
|
||||
"Latn", // LATIN
|
||||
|
@ -275,17 +288,17 @@ final class UCD_Names implements UCD_Types {
|
|||
"Dsrt",
|
||||
"Qaai",
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static final String[] AGE = {
|
||||
"UNSPECIFIED",
|
||||
"1.1",
|
||||
"2.0", "2.1",
|
||||
"3.0", "3.1"
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
static final String[] GC = {
|
||||
"Cn", // = Other, Not Assigned 0
|
||||
|
||||
|
@ -328,7 +341,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"Pi", // = Punctuation, Initial quote 29 (may behave like Ps or Pe depending on usage)
|
||||
"Pf" // = Punctuation, Final quote 30 (may behave like Ps or Pe dependingon usage)
|
||||
};
|
||||
|
||||
|
||||
static final String[] LONG_GC = {
|
||||
"Unassigned", // = Other, Not Assigned 0
|
||||
|
||||
|
@ -372,7 +385,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"FinalPunctuation" // = Punctuation, Final quote 30 (may behave like Ps or Pe dependingon usage)
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
static String[] BC = {
|
||||
"L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
|
||||
|
@ -388,7 +401,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"ON", // Other Neutrals ; All other characters: punctuation, symbols
|
||||
"<unused>", "BN", "NSM", "AL", "LRO", "RLO", "LRE", "RLE", "PDF"
|
||||
};
|
||||
|
||||
|
||||
static String[] LONG_BC = {
|
||||
"LeftToRight", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
|
||||
"RightToLeft", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
|
||||
|
@ -401,13 +414,13 @@ final class UCD_Names implements UCD_Types {
|
|||
"SegmentSeparator", // Segment Separator
|
||||
"WhiteSpace", // Whitespace
|
||||
"OtherNeutral", // Other Neutrals ; All other characters: punctuation, symbols
|
||||
"<unused>",
|
||||
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
|
||||
"LeftToRightOverride",
|
||||
"RightToLeftOverride", "LeftToRightEmbedding",
|
||||
"<unused>",
|
||||
"BoundaryNeutral", "NonspacingMark", "ArabicLetter",
|
||||
"LeftToRightOverride",
|
||||
"RightToLeftOverride", "LeftToRightEmbedding",
|
||||
"RightToLeftEmbedding", "PopDirectionalFormat"
|
||||
};
|
||||
|
||||
|
||||
private static String[] CASE_TABLE = {
|
||||
"LOWER", "TITLE", "UPPER", "UNCASED"
|
||||
};
|
||||
|
@ -432,7 +445,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"square", // A CJK squared font variant.
|
||||
"fraction", // A vulgar fraction form.
|
||||
};
|
||||
|
||||
|
||||
static String[] SHORT_DT = {
|
||||
"", // NONE
|
||||
"ca", // CANONICAL
|
||||
|
@ -453,7 +466,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"sq", // A CJK squared font variant.
|
||||
"fr", // A vulgar fraction form.
|
||||
};
|
||||
|
||||
|
||||
static private String[] MIRRORED_TABLE = {
|
||||
"N",
|
||||
"Y"
|
||||
|
@ -465,14 +478,14 @@ final class UCD_Names implements UCD_Types {
|
|||
"digit",
|
||||
"decimal",
|
||||
};
|
||||
|
||||
|
||||
static String[] SHORT_NT = {
|
||||
"",
|
||||
"nu",
|
||||
"di",
|
||||
"de",
|
||||
};
|
||||
|
||||
|
||||
static {
|
||||
if (LIMIT_CATEGORY != GC.length) {
|
||||
System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: category");
|
||||
|
@ -511,9 +524,9 @@ final class UCD_Names implements UCD_Types {
|
|||
System.err.println("!! ERROR !! UnicodeTypes and UInfo out of sync: age");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static byte ON = Utility.lookup("ON", BC);
|
||||
|
||||
|
||||
public static String[] JOINING_TYPE = {
|
||||
"C",
|
||||
"D",
|
||||
|
@ -584,7 +597,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"YUDH_HE",
|
||||
"ZAIN",
|
||||
};
|
||||
|
||||
|
||||
public static String[] OLD_JOINING_GROUP = {
|
||||
"<no shaping>",
|
||||
"AIN",
|
||||
|
@ -637,9 +650,9 @@ final class UCD_Names implements UCD_Types {
|
|||
"YUDH HE",
|
||||
"ZAIN",
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static String[] JAMO_L_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"G", // U+1100; G; HANGUL CHOSEONG KIYEOK
|
||||
|
@ -662,7 +675,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"P", // U+1111; P; HANGUL CHOSEONG PHIEUPH
|
||||
"H" // U+1112; H; HANGUL CHOSEONG HIEUH
|
||||
};
|
||||
|
||||
|
||||
static String[] JAMO_V_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"A", // U+1161; A; HANGUL JUNGSEONG A
|
||||
|
@ -687,7 +700,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"YI", // U+1174; YI; HANGUL JUNGSEONG YI
|
||||
"I", // U+1175; I; HANGUL JUNGSEONG I
|
||||
};
|
||||
|
||||
|
||||
static String[] JAMO_T_TABLE = {
|
||||
// Value; Short Name; Unicode Name
|
||||
"", // filler, for LV syllable
|
||||
|
@ -721,7 +734,7 @@ final class UCD_Names implements UCD_Types {
|
|||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
static {
|
||||
UNASSIGNED_INFO.code = '\uFFFF';
|
||||
|
|
|
@ -1,10 +1,23 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
public interface UCD_Types {
|
||||
public static final String DATA_DIR = "C:\\DATA\\";
|
||||
public static final String BIN_DIR = DATA_DIR + "\\BIN\\";
|
||||
public static final String GEN_DIR = DATA_DIR + "\\GEN\\";
|
||||
|
||||
|
||||
|
||||
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
|
||||
/*
|
||||
|
@ -24,7 +37,7 @@ public interface UCD_Types {
|
|||
13 Lower case equivalent mapping. Similar to 12. This field is informative.
|
||||
14 Title case equivalent mapping. Similar to 12. This field is informative.
|
||||
*/
|
||||
|
||||
|
||||
// Binary ENUM Grouping
|
||||
public static final int
|
||||
CATEGORY = 0,
|
||||
|
@ -41,9 +54,9 @@ public interface UCD_Types {
|
|||
AGE = 0xB00,
|
||||
NEXT_ENUM = 0x100,
|
||||
LIMIT_ENUM = AGE + 0x100;
|
||||
|
||||
|
||||
public static final int LIMIT_COMBINING_CLASS = 256;
|
||||
|
||||
|
||||
// getCategory
|
||||
public static final byte
|
||||
UNASSIGNED = 0,
|
||||
|
@ -78,7 +91,7 @@ public interface UCD_Types {
|
|||
INITIAL_PUNCTUATION = 29,
|
||||
FINAL_PUNCTUATION = 30,
|
||||
LIMIT_CATEGORY = FINAL_PUNCTUATION+1,
|
||||
|
||||
|
||||
// Unicode abbreviations
|
||||
Lu = UPPERCASE_LETTER,
|
||||
Ll = LOWERCASE_LETTER,
|
||||
|
@ -110,7 +123,7 @@ public interface UCD_Types {
|
|||
Sc = CURRENCY_SYMBOL,
|
||||
Sk = MODIFIER_SYMBOL,
|
||||
So = OTHER_SYMBOL;
|
||||
|
||||
|
||||
static final int
|
||||
LETTER_MASK = (1<<Lu) | (1<<Ll) | (1<<Lt) | (1<<Lm) | (1 << Lo),
|
||||
MARK_MASK = (1<<Mn) | (1<<Me) | (1<<Mc),
|
||||
|
@ -120,12 +133,12 @@ public interface UCD_Types {
|
|||
PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
|
||||
SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
|
||||
UNASSIGNED_MASK = (1<<Cn);
|
||||
|
||||
|
||||
// Binary Properties
|
||||
|
||||
|
||||
public static final byte
|
||||
BidiMirrored = 0,
|
||||
CompositionExclusion = 1,
|
||||
CompositionExclusion = 1,
|
||||
White_space = 2,
|
||||
Non_break = 3,
|
||||
Bidi_Control = 4,
|
||||
|
@ -154,11 +167,11 @@ public interface UCD_Types {
|
|||
Reserved_Cf_Code_Point = 27,
|
||||
Deprecated = 28,
|
||||
LIMIT_BINARY_PROPERTIES = 29;
|
||||
|
||||
|
||||
/*
|
||||
static final int
|
||||
BidiMirroredMask = 1<<BidiMirrored,
|
||||
CompositionExclusionMask = 1<<CompositionExclusion,
|
||||
CompositionExclusionMask = 1<<CompositionExclusion,
|
||||
AlphabeticMask = 1<<Alphabetic,
|
||||
Bidi_ControlMask = 1<<Bidi_Control,
|
||||
DashMask = 1<<Dash,
|
||||
|
@ -181,15 +194,15 @@ public interface UCD_Types {
|
|||
// line break
|
||||
public static final byte
|
||||
LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
|
||||
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
|
||||
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
|
||||
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
|
||||
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
|
||||
LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
|
||||
|
||||
|
||||
// east asian width
|
||||
public static final byte
|
||||
EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
|
||||
EAN = 0, EAA = 1, EAH = 2, EAW = 3, EAF = 4, EANa = 5,
|
||||
LIMIT_EAST_ASIAN_WIDTH = 6;
|
||||
|
||||
|
||||
// bidi class
|
||||
static final byte
|
||||
BIDI_L = 0, // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
|
||||
|
@ -214,7 +227,7 @@ public interface UCD_Types {
|
|||
BIDI_RLE = 18,
|
||||
BIDI_PDF = 19,
|
||||
LIMIT_BIDI_CLASS = 20;
|
||||
|
||||
|
||||
// decompositionType
|
||||
static final byte NONE = 0,
|
||||
CANONICAL = 1,
|
||||
|
@ -239,7 +252,7 @@ public interface UCD_Types {
|
|||
|
||||
// mirrored type
|
||||
static final byte NO = 0, YES = 1, MIRRORED_LIMIT = 2;
|
||||
|
||||
|
||||
// for QuickCheck
|
||||
static final byte QNO = 0, QMAYBE = 1, QYES = 2;
|
||||
|
||||
|
@ -251,7 +264,7 @@ public interface UCD_Types {
|
|||
static final byte UNNORMALIZED = 0, C = 1, KC = 2, D = 3, KD = 4, FORM_LIMIT = 5;
|
||||
|
||||
// numericType
|
||||
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
|
||||
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
|
||||
LIMIT_NUMERIC_TYPE = 4;
|
||||
|
||||
public static final byte // SCRIPT CODE
|
||||
|
@ -263,7 +276,7 @@ public interface UCD_Types {
|
|||
HEBREW_SCRIPT = 5,
|
||||
ARABIC_SCRIPT = 6,
|
||||
SYRIAC_SCRIPT = 7,
|
||||
THAANA_SCRIPT = 8,
|
||||
THAANA_SCRIPT = 8,
|
||||
DEVANAGARI_SCRIPT = 9,
|
||||
BENGALI_SCRIPT = 10,
|
||||
GURMUKHI_SCRIPT = 11,
|
||||
|
@ -298,8 +311,8 @@ public interface UCD_Types {
|
|||
DESERET_SCRIPT = 40,
|
||||
INHERITED_SCRIPT = 41,
|
||||
LIMIT_SCRIPT = 42;
|
||||
|
||||
static final int
|
||||
|
||||
static final int
|
||||
UNKNOWN = 0,
|
||||
AGE10 = 1,
|
||||
AGE20 = 2,
|
||||
|
@ -307,9 +320,9 @@ public interface UCD_Types {
|
|||
AGE30 = 4,
|
||||
AGE31 = 5,
|
||||
LIMIT_AGE = 6;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
public static byte
|
||||
JT_C = 0,
|
||||
JT_D = 1,
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.*;
|
||||
|
@ -16,24 +29,24 @@ class UData implements UCD_Types {
|
|||
String fullCaseFolding;
|
||||
String specialCasing = "";
|
||||
String bidiMirror;
|
||||
|
||||
|
||||
int codePoint = -1;
|
||||
float numericValue = Float.NaN;
|
||||
int binaryProperties; // bidiMirroring, compositionExclusions, PropList
|
||||
|
||||
|
||||
byte generalCategory = Cn;
|
||||
byte combiningClass = 0;
|
||||
byte bidiClass = BIDI_ON;
|
||||
byte decompositionType = NONE;
|
||||
byte numericType = NUMERIC_NONE;
|
||||
|
||||
|
||||
byte eastAsianWidth = EAN;
|
||||
byte lineBreak = LBXX;
|
||||
byte joiningType = JT_U;
|
||||
byte joiningGroup = NO_SHAPING;
|
||||
byte script = COMMON_SCRIPT;
|
||||
byte age = 0;
|
||||
|
||||
|
||||
static final UData UNASSIGNED = new UData();
|
||||
//static final UData NONCHARACTER = new UData();
|
||||
static {
|
||||
|
@ -43,7 +56,7 @@ class UData implements UCD_Types {
|
|||
= UNASSIGNED.simpleLowercase
|
||||
= UNASSIGNED.simpleTitlecase = "";
|
||||
UNASSIGNED.fleshOut();
|
||||
|
||||
|
||||
/*NONCHARACTER.name = "<noncharacter>";
|
||||
NONCHARACTER.decompositionMapping = NONCHARACTER.bidiMirror
|
||||
= NONCHARACTER.simpleUppercase
|
||||
|
@ -54,14 +67,14 @@ class UData implements UCD_Types {
|
|||
NONCHARACTER.fleshOut();
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a record for the given code point; all other fields keep their
 * declared defaults.
 *
 * @param codePoint the code point this record describes
 */
public UData(int codePoint) {
    this.codePoint = codePoint;
}
|
||||
|
||||
|
||||
/**
 * Creates a record in which every field, including the code point, keeps its
 * declared default.
 */
public UData() {
    // nothing to initialise beyond the field initialisers
}
|
||||
|
||||
|
||||
public boolean equals(Object that) {
|
||||
UData other = (UData) that;
|
||||
if (!name.equals(other.name)) return false;
|
||||
|
@ -92,87 +105,87 @@ class UData implements UCD_Types {
|
|||
if (age != other.age) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public void fleshOut() {
|
||||
String codeValue = UTF32.valueOf32(codePoint);
|
||||
|
||||
|
||||
if (decompositionMapping == null) decompositionMapping = codeValue;
|
||||
if (bidiMirror == null) bidiMirror = codeValue;
|
||||
|
||||
|
||||
if (simpleLowercase == null) simpleLowercase = codeValue;
|
||||
if (simpleCaseFolding == null) simpleCaseFolding = simpleLowercase;
|
||||
if (fullLowercase == null) fullLowercase = simpleLowercase;
|
||||
if (fullCaseFolding == null) fullCaseFolding = fullLowercase;
|
||||
|
||||
|
||||
if (simpleUppercase == null) simpleUppercase = codeValue;
|
||||
if (simpleTitlecase == null) simpleTitlecase = codeValue;
|
||||
if (fullUppercase == null) fullUppercase = simpleUppercase;
|
||||
|
||||
|
||||
if (fullTitlecase == null) fullTitlecase = simpleTitlecase;
|
||||
}
|
||||
|
||||
|
||||
public void compact() {
|
||||
fleshOut();
|
||||
String codeValue = UTF32.valueOf32(codePoint);
|
||||
|
||||
|
||||
if (fullTitlecase.equals(simpleTitlecase)) fullTitlecase = null;
|
||||
|
||||
|
||||
if (fullUppercase.equals(simpleUppercase)) fullUppercase = null;
|
||||
if (simpleTitlecase.equals(codeValue)) simpleTitlecase = null;
|
||||
if (simpleUppercase.equals(codeValue)) simpleUppercase = null;
|
||||
|
||||
|
||||
if (fullCaseFolding.equals(fullLowercase)) fullCaseFolding = null;
|
||||
if (fullLowercase.equals(simpleLowercase)) fullLowercase = null;
|
||||
if (simpleCaseFolding.equals(simpleLowercase)) simpleCaseFolding = null;
|
||||
if (simpleLowercase.equals(codeValue)) simpleLowercase = null;
|
||||
|
||||
|
||||
if (decompositionMapping.equals(codeValue)) decompositionMapping = null;
|
||||
if (bidiMirror.equals(codeValue)) bidiMirror = null;
|
||||
}
|
||||
|
||||
|
||||
public void setBinaryProperties(int binaryProperties) {
|
||||
this.binaryProperties = binaryProperties;
|
||||
}
|
||||
|
||||
|
||||
public boolean isLetter() {
|
||||
return ((1<<generalCategory) & UCD_Types.LETTER_MASK) != 0;
|
||||
}
|
||||
|
||||
|
||||
public static void writeString(DataOutputStream os, String s) throws IOException {
|
||||
if (s == null) {
|
||||
os.writeByte(0);
|
||||
os.writeByte(0);
|
||||
} else {
|
||||
os.writeByte(1);
|
||||
os.writeUTF(s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static final byte[] byteBuffer = new byte[256];
|
||||
|
||||
|
||||
public static String readString(DataInputStream is) throws IOException {
|
||||
int type = is.readUnsignedByte();
|
||||
if (type == 0) return null;
|
||||
return is.readUTF();
|
||||
}
|
||||
|
||||
|
||||
static final byte ABBREVIATED = 0, FULL = 1;
|
||||
|
||||
|
||||
public String toString() {
|
||||
return toString(FULL);
|
||||
}
|
||||
|
||||
|
||||
public String toString(byte style) {
|
||||
boolean full = style == FULL;
|
||||
StringBuffer result = new StringBuffer();
|
||||
String s = UTF32.valueOf32(codePoint);
|
||||
|
||||
|
||||
result.append("<e c='").append(Utility.quoteXML(codePoint)).append('\'');
|
||||
result.append(" hx='").append(Utility.hex(codePoint)).append('\'');
|
||||
if (full || script != COMMON_SCRIPT) result.append(" sn='").append(UCD_Names.SCRIPT[script]).append('\'');
|
||||
result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");
|
||||
|
||||
|
||||
int lastPos = result.length();
|
||||
|
||||
|
||||
if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
|
||||
if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
|
||||
if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
|
||||
|
@ -180,21 +193,21 @@ class UData implements UCD_Types {
|
|||
|
||||
if (full || numericType != NUMERIC_NONE) result.append(" nt='").append(UCD_Names.NT[numericType]).append('\'');
|
||||
if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
|
||||
|
||||
|
||||
if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
|
||||
if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
|
||||
if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
|
||||
if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
|
||||
if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
|
||||
|
||||
|
||||
if (full || bidiClass != BIDI_L) result.append(" bc='").append(UCD_Names.BC[bidiClass]).append('\'');
|
||||
if (full || !bidiMirror.equals(s)) result.append(" bmg='").append(Utility.quoteXML(bidiMirror)).append('\'');
|
||||
|
||||
|
||||
if (lastPos != result.length()) {
|
||||
result.append("\r\n");
|
||||
lastPos = result.length();
|
||||
}
|
||||
|
||||
|
||||
//String bp = "";
|
||||
int bprops = binaryProperties;
|
||||
for (int i = 0; i < LIMIT_BINARY_PROPERTIES; ++i) {
|
||||
|
@ -204,26 +217,26 @@ class UData implements UCD_Types {
|
|||
result.append("\r\n");
|
||||
lastPos = result.length();
|
||||
}
|
||||
|
||||
|
||||
if (full || !fullLowercase.equals(s)) result.append(" lc='").append(Utility.quoteXML(fullLowercase)).append('\'');
|
||||
if (full || !fullUppercase.equals(simpleUppercase)) result.append(" uc='").append(Utility.quoteXML(fullUppercase)).append('\'');
|
||||
if (full || !fullTitlecase.equals(fullUppercase)) result.append(" tc='").append(Utility.quoteXML(fullTitlecase)).append('\'');
|
||||
if (full || !fullCaseFolding.equals(fullLowercase)) result.append(" cf='").append(Utility.quoteXML(fullCaseFolding)).append('\'');
|
||||
|
||||
|
||||
if (full || !simpleLowercase.equals(simpleLowercase)) result.append(" slc='").append(Utility.quoteXML(simpleLowercase)).append('\'');
|
||||
if (full || !simpleUppercase.equals(simpleUppercase)) result.append(" suc='").append(Utility.quoteXML(simpleUppercase)).append('\'');
|
||||
if (full || !simpleTitlecase.equals(simpleUppercase)) result.append(" stc='").append(Utility.quoteXML(simpleTitlecase)).append('\'');
|
||||
if (full || !simpleCaseFolding.equals(simpleLowercase)) result.append(" sfc='").append(Utility.quoteXML(simpleCaseFolding)).append('\'');
|
||||
|
||||
|
||||
if (full || !specialCasing.equals("")) result.append(" fsc='").append(Utility.quoteXML(specialCasing)).append('\'');
|
||||
result.append("/>");
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public void writeBytes(DataOutputStream os) throws IOException {
|
||||
compact();
|
||||
os.writeInt(codePoint);
|
||||
|
||||
|
||||
writeString(os, name);
|
||||
writeString(os, decompositionMapping);
|
||||
writeString(os, simpleUppercase);
|
||||
|
@ -236,10 +249,10 @@ class UData implements UCD_Types {
|
|||
writeString(os, fullCaseFolding);
|
||||
writeString(os, specialCasing);
|
||||
writeString(os, bidiMirror);
|
||||
|
||||
|
||||
os.writeFloat(numericValue);
|
||||
os.writeInt(binaryProperties);
|
||||
|
||||
|
||||
os.writeByte(generalCategory);
|
||||
os.writeByte(combiningClass);
|
||||
os.writeByte(bidiClass);
|
||||
|
@ -252,10 +265,10 @@ class UData implements UCD_Types {
|
|||
os.writeByte(script);
|
||||
os.writeByte(age);
|
||||
}
|
||||
|
||||
|
||||
public void readBytes(DataInputStream is) throws IOException {
|
||||
codePoint = is.readInt();
|
||||
|
||||
|
||||
name = readString(is);
|
||||
decompositionMapping = readString(is);
|
||||
simpleUppercase = readString(is);
|
||||
|
@ -268,10 +281,10 @@ class UData implements UCD_Types {
|
|||
fullCaseFolding = readString(is);
|
||||
specialCasing = readString(is);
|
||||
bidiMirror = readString(is);
|
||||
|
||||
|
||||
numericValue = is.readFloat();
|
||||
binaryProperties = is.readInt();
|
||||
|
||||
|
||||
generalCategory = is.readByte();
|
||||
combiningClass = is.readByte();
|
||||
bidiClass = is.readByte();
|
||||
|
@ -284,7 +297,7 @@ class UData implements UCD_Types {
|
|||
script = is.readByte();
|
||||
age = is.readByte();
|
||||
fleshOut();
|
||||
|
||||
|
||||
// HACK
|
||||
/*
|
||||
int bp = binaryProperties;
|
||||
|
@ -300,7 +313,7 @@ class UData implements UCD_Types {
|
|||
binaryProperties = bp;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
if (generalCategory == Sm) {
|
||||
if ((binaryProperties & Math_PropertyMask) != 0) {
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -11,25 +24,25 @@ import java.io.*;
|
|||
import com.ibm.text.utility.*;
|
||||
|
||||
public class VerifyUCD implements UCD_Types {
|
||||
|
||||
|
||||
public static final String IDN_DIR = DATA_DIR + "\\IDN\\";
|
||||
static String ucdVersion = "";
|
||||
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Argument: " + args[i]);
|
||||
|
||||
|
||||
if (arg.equalsIgnoreCase("all")) {
|
||||
//checkCase();
|
||||
checkCanonicalProperties();
|
||||
CheckCaseFold();
|
||||
checkAgainstUInfo();
|
||||
|
||||
|
||||
} else if (arg.equalsIgnoreCase("build")) {
|
||||
ConvertUCD.main(new String[]{ucdVersion});
|
||||
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
|
@ -46,9 +59,9 @@ public class VerifyUCD implements UCD_Types {
|
|||
else if (arg.equalsIgnoreCase("IdentifierTest")) IdentifierTest();
|
||||
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(Utility.split(args[++i],','));
|
||||
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
|
||||
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
|
||||
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
|
||||
|
||||
|
||||
else {
|
||||
System.out.println("Unknown option -- must be one of the following (case-insensitive)");
|
||||
System.out.println("generateXML, checkCase, checkCanonicalProperties, CheckCaseFold,");
|
||||
|
@ -58,7 +71,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
System.out.println(ucd.toString(0x0387));
|
||||
System.out.println(ucd.toString(0x00B7));
|
||||
|
@ -70,7 +83,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
System.out.println(ucd.toString(0x0131));
|
||||
System.out.println(ucd.toString(0x0345));
|
||||
*/
|
||||
|
||||
|
||||
static void checkAgainstOtherVersion(String otherVersion) {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
UCD ucd2 = UCD.make(otherVersion);
|
||||
|
@ -85,15 +98,15 @@ public class VerifyUCD implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void generateXML() throws IOException {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
String filename = "UCD.xml";
|
||||
PrintWriter log = Utility.openPrintWriter(filename);
|
||||
|
||||
|
||||
//log.println('\uFEFF');
|
||||
log.println("<ucd>");
|
||||
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
|
@ -102,13 +115,13 @@ public class VerifyUCD implements UCD_Types {
|
|||
}
|
||||
log.println(ucd.toString(cp));
|
||||
}
|
||||
|
||||
|
||||
log.println("</ucd>");
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
static final byte MIXED = (byte)(UNCASED + 1);
|
||||
|
||||
|
||||
public static void checkCase() throws IOException {
|
||||
Utility.fixDot();
|
||||
System.out.println("checkCase");
|
||||
|
@ -117,7 +130,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
||||
String fileName = "CaseDifferences.txt";
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
|
||||
|
@ -128,13 +141,13 @@ public class VerifyUCD implements UCD_Types {
|
|||
String xu = ucd.getCase(x, FULL, UPPER);
|
||||
String xl = ucd.getCase(x, FULL, LOWER);
|
||||
String xt = ucd.getCase(x, FULL, TITLE);
|
||||
|
||||
|
||||
byte caseCat = MIXED;
|
||||
if (xu.equals(xl)) caseCat = UNCASED;
|
||||
else if (x.equals(xl)) caseCat = LOWER;
|
||||
else if (x.equals(xu)) caseCat = UPPER;
|
||||
else if (x.equals(xt)) caseCat = TITLE;
|
||||
|
||||
|
||||
byte cat = ucd.getCategory(cp);
|
||||
boolean otherLower = ucd.getBinaryProperty(cp, Other_Lowercase);
|
||||
boolean otherUpper = ucd.getBinaryProperty(cp, Other_Uppercase);
|
||||
|
@ -142,15 +155,15 @@ public class VerifyUCD implements UCD_Types {
|
|||
: (cat == Ll || otherLower) ? LOWER
|
||||
: (cat == Lt) ? TITLE
|
||||
: UNCASED;
|
||||
|
||||
|
||||
if (caseCat != oldCaseCat) {
|
||||
log.println(UTF32.valueOf32(cp)
|
||||
+ "\t" + names[caseCat]
|
||||
+ "\t" + names[caseCat]
|
||||
+ "\t" + names[oldCaseCat]
|
||||
+ "\t" + ucd.getCategoryID_fromIndex(cat)
|
||||
+ "\t" + lowerNames[otherLower ? 1 : 0]
|
||||
+ "\t" + upperNames[otherUpper ? 1 : 0]
|
||||
+ "\t" + ucd.getCodeAndName(cp)
|
||||
+ "\t" + ucd.getCategoryID_fromIndex(cat)
|
||||
+ "\t" + lowerNames[otherLower ? 1 : 0]
|
||||
+ "\t" + upperNames[otherUpper ? 1 : 0]
|
||||
+ "\t" + ucd.getCodeAndName(cp)
|
||||
+ "\t" + ucd.getCodeAndName(x)
|
||||
+ "\t" + ucd.getCodeAndName(xu)
|
||||
+ "\t" + ucd.getCodeAndName(xl)
|
||||
|
@ -158,10 +171,10 @@ public class VerifyUCD implements UCD_Types {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
public static void checkCase2() throws IOException {
|
||||
Utility.fixDot();
|
||||
System.out.println("checkCase");
|
||||
|
@ -170,52 +183,52 @@ public class VerifyUCD implements UCD_Types {
|
|||
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
||||
String fileName = "CaseNormalizationDifferences.txt";
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
|
||||
|
||||
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
|
||||
log.println("u, l, t - upper, lower, title");
|
||||
log.println("c, d - nfc, nfd");
|
||||
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
|
||||
if (cp == '\u3371') {
|
||||
System.out.println("debug");
|
||||
}
|
||||
|
||||
|
||||
String x = UTF32.valueOf32(cp);
|
||||
|
||||
|
||||
String ux = ucd.getCase(x, FULL, UPPER);
|
||||
String lx = ucd.getCase(x, FULL, LOWER);
|
||||
String tx = ucd.getCase(x, FULL, TITLE);
|
||||
|
||||
|
||||
String dux = nfd.normalize(ux);
|
||||
String dlx = nfd.normalize(lx);
|
||||
String dtx = nfd.normalize(tx);
|
||||
|
||||
|
||||
String cux = nfc.normalize(ux);
|
||||
String clx = nfc.normalize(lx);
|
||||
String ctx = nfc.normalize(tx);
|
||||
|
||||
|
||||
String dx = nfd.normalize(cp);
|
||||
String cx = nfc.normalize(cp);
|
||||
|
||||
|
||||
String udx = ucd.getCase(dx, FULL, UPPER);
|
||||
String ldx = ucd.getCase(dx, FULL, LOWER);
|
||||
String tdx = ucd.getCase(dx, FULL, TITLE);
|
||||
|
||||
|
||||
String ucx = ucd.getCase(cx, FULL, UPPER);
|
||||
String lcx = ucd.getCase(cx, FULL, LOWER);
|
||||
String tcx = ucd.getCase(cx, FULL, TITLE);
|
||||
|
||||
|
||||
String dudx = nfd.normalize(udx);
|
||||
String dldx = nfd.normalize(ldx);
|
||||
String dtdx = nfd.normalize(tdx);
|
||||
|
||||
|
||||
String cucx = nfc.normalize(ucx);
|
||||
String clcx = nfc.normalize(lcx);
|
||||
String ctcx = nfc.normalize(tcx);
|
||||
|
||||
|
||||
|
||||
|
||||
if (!dux.equals(udx)
|
||||
|| !dlx.equals(ldx)
|
||||
|| !dtx.equals(tdx)
|
||||
|
@ -236,7 +249,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
if (!tx.equals(ux)) log.println("\tt(cp):\t" + ucd.getCodeAndName(tx));
|
||||
if (!x.equals(dx)) log.println("\td(cp):\t" + ucd.getCodeAndName(dx));
|
||||
if (!x.equals(cx)) log.println("\tc(cp):\t" + ucd.getCodeAndName(cx));
|
||||
|
||||
|
||||
if (!dux.equals(udx)) {
|
||||
log.println();
|
||||
log.println("\td(u(cp)):\t" + ucd.getCodeAndName(dux));
|
||||
|
@ -252,7 +265,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
log.println("\td(t(cp)):\t" + ucd.getCodeAndName(dtx));
|
||||
log.println("\tt(d(cp)):\t" + ucd.getCodeAndName(tdx));
|
||||
}
|
||||
|
||||
|
||||
if (!cux.equals(ucx)) {
|
||||
log.println();
|
||||
log.println("\tc(u(cp)):\t" + ucd.getCodeAndName(cux));
|
||||
|
@ -268,9 +281,9 @@ public class VerifyUCD implements UCD_Types {
|
|||
log.println("\tc(t(cp)):\t" + ucd.getCodeAndName(ctx));
|
||||
log.println("\tt(c(cp)):\t" + ucd.getCodeAndName(tcx));
|
||||
}
|
||||
|
||||
|
||||
// ...........
|
||||
|
||||
|
||||
if (!udx.equals(dudx)) {
|
||||
log.println();
|
||||
log.println("\tu(d(cp)):\t" + ucd.getCodeAndName(udx));
|
||||
|
@ -286,7 +299,7 @@ public class VerifyUCD implements UCD_Types {
|
|||
log.println("\tt(d(cp)):\t" + ucd.getCodeAndName(tdx));
|
||||
log.println("\td(t(d(cp))):\t" + ucd.getCodeAndName(dtdx));
|
||||
}
|
||||
|
||||
|
||||
if (!ucx.equals(cucx)) {
|
||||
log.println();
|
||||
log.println("\tu(c(cp)):\t" + ucd.getCodeAndName(ucx));
|
||||
|
@ -304,14 +317,14 @@ public class VerifyUCD implements UCD_Types {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
log.close();
|
||||
}
|
||||
|
||||
|
||||
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
|
||||
static final String lowerNames[] = {"", "Other_Lower"};
|
||||
static final String upperNames[] = {"", "Other_Upper"};
|
||||
|
||||
|
||||
public static void CheckCaseFold() {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
System.out.println("Checking Case Fold");
|
||||
|
@ -320,10 +333,10 @@ public class VerifyUCD implements UCD_Types {
|
|||
if (!ucd.isAssigned(cp) || ucd.isPUA(cp)) continue;
|
||||
String fullTest = ucd.getCase(ucd.getCase(cp, FULL, UPPER), FULL, LOWER);
|
||||
String simpleTest = ucd.getCase(ucd.getCase(cp, SIMPLE, UPPER), SIMPLE, LOWER);
|
||||
|
||||
|
||||
String full = ucd.getCase(cp, FULL, FOLD);
|
||||
String simple = ucd.getCase(cp, SIMPLE, FOLD);
|
||||
|
||||
|
||||
boolean failed = false;
|
||||
if (!full.equals(fullTest)) {
|
||||
Utility.fixDot();
|
||||
|
@ -342,29 +355,29 @@ public class VerifyUCD implements UCD_Types {
|
|||
if (failed) System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void VerifyIDN() throws IOException {
|
||||
System.out.println("VerifyIDN");
|
||||
ucd = UCD.make(ucdVersion);
|
||||
initNormalizers();
|
||||
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Checking Map");
|
||||
System.out.println();
|
||||
|
||||
|
||||
BitSet mappedOut = new BitSet();
|
||||
int errorCount = verifyUTFMap(mappedOut);
|
||||
|
||||
|
||||
BitSet unassigned = getIDNList("IDN-Unassigned.txt");
|
||||
BitSet prohibited = getIDNList("IDN-Prohibited.txt");
|
||||
BitSet guessSet = guessIDN();
|
||||
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Checking Prohibited and Unassigned");
|
||||
System.out.println();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (mappedOut.get(cp)) continue;
|
||||
|
||||
|
||||
boolean ucdUnassigned = !ucd.isAllocated(cp);
|
||||
boolean idnUnassigned = unassigned.get(cp);
|
||||
boolean guess = guessSet.get(cp);
|
||||
|
@ -377,12 +390,12 @@ public class VerifyUCD implements UCD_Types {
|
|||
showError("Not UCD Unassigned but IDN Unassigned: ", cp);
|
||||
++errorCount;
|
||||
}
|
||||
|
||||
|
||||
if (idnProhibited && unassigned.get(cp)) {
|
||||
showError("Both IDN Unassigned AND IDN Prohibited: ", cp);
|
||||
++errorCount;
|
||||
}
|
||||
|
||||
|
||||
if (guess && !idnProhibited) {
|
||||
showError("UCD ?prohibited? but not IDN Prohibited: ", cp);
|
||||
++errorCount;
|
||||
|
@ -390,72 +403,72 @@ public class VerifyUCD implements UCD_Types {
|
|||
showError("Not UCD ?prohibited? but IDN Prohibited: ", cp);
|
||||
++errorCount;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
System.out.println();
|
||||
System.out.println("Total Errors: " + errorCount);
|
||||
}
|
||||
|
||||
|
||||
static void showError(String description, int cp) {
|
||||
System.out.println(description + ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public static BitSet guessIDN() {
|
||||
BitSet result = new BitSet();
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
int cat = ucd.getCategory(cp);
|
||||
// 5.1 Currently-prohibited ASCII characters
|
||||
|
||||
|
||||
if (cp < 0x80 && cp != '-' && !(cat == Lu || cat == Ll || cat == Nd)) result.set(cp);
|
||||
|
||||
|
||||
// 5.2 Space characters
|
||||
|
||||
|
||||
if (cat == Zs) result.set(cp);
|
||||
|
||||
|
||||
// 5.3 Control characters
|
||||
if (cat == Cc || cat == Zp || cat == Zl) result.set(cp);
|
||||
|
||||
|
||||
// exclude those reserved for Cf
|
||||
/*if (0x2060 <= cp && cp <= 0x206F) result.set(cp);
|
||||
if (0xFFF0 <= cp && cp <= 0xFFFC) result.set(cp);
|
||||
if (0xE0000 <= cp && cp <= 0xE0FFF) result.set(cp);
|
||||
*/
|
||||
|
||||
|
||||
// 5.4 Private use and replacement characters
|
||||
|
||||
|
||||
if (cat == Co) result.set(cp);
|
||||
if (cp == 0xFFFD) result.set(cp);
|
||||
|
||||
|
||||
// 5.5 Non-character code points
|
||||
if (ucd.getBinaryProperty(cp, Noncharacter_Code_Point)) result.set(cp);
|
||||
|
||||
|
||||
// 5.6 Surrogate codes
|
||||
if (cat == Cs) result.set(cp);
|
||||
|
||||
|
||||
// 5.7 Inappropriate for plain text
|
||||
|
||||
|
||||
if (cat == Cf) result.set(cp);
|
||||
if (cp == 0xFFFC) result.set(cp);
|
||||
|
||||
|
||||
// 5.8 Inappropriate for domain names
|
||||
|
||||
|
||||
if (isIDS(cp)) result.set(cp);
|
||||
|
||||
|
||||
// 5.9 Change display properties
|
||||
// Cf, checked above
|
||||
|
||||
|
||||
// 5.10 Inappropriate characters from common input mechanisms
|
||||
if (cp == 0x3002) result.set(cp);
|
||||
|
||||
|
||||
// 5.11 Tagging characters
|
||||
// Cf, checked above
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Tests whether a code point lies in the range 0x2FF0..0x2FFB (inclusive),
 * which guessIDN treats as "inappropriate for domain names".
 * NOTE(review): presumably the Ideographic Description characters - confirm.
 *
 * @param cp code point to test
 * @return true when 0x2FF0 <= cp <= 0x2FFB
 */
static boolean isIDS(int cp) {
    if (cp < 0x2FF0) {
        return false;
    }
    return cp <= 0x2FFB;
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
5.1 Currently-prohibited ASCII characters
|
||||
|
@ -610,8 +623,8 @@ The following characters are used for tagging text and are invisible.
|
|||
E0001; LANGUAGE TAG
|
||||
E0020-E007F; [TAGGING CHARACTERS]
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
public static int verifyUTFMap(BitSet mappedOut) throws IOException {
|
||||
int errorCount = 0;
|
||||
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + "IDN-Mapping.txt"),32*1024);
|
||||
|
@ -627,9 +640,9 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
Utility.fixDot();
|
||||
System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
}
|
||||
|
||||
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int count = Utility.split(line,';',parts);
|
||||
if (count != 3) throw new ChainException("Incorrect # of fields in IDN folding", null);
|
||||
|
||||
|
@ -650,12 +663,12 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
idnFold.put(key, value);
|
||||
idnWhy.put(key, reason);
|
||||
}
|
||||
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!ucd.isAssigned(cp) || ucd.isPUA(cp)) continue;
|
||||
if (mappedOut.get(cp)) continue;
|
||||
|
||||
|
||||
String key = UTF32.valueOf32(cp);
|
||||
String value = (String)idnFold.get(key);
|
||||
if (value == null) value = key;
|
||||
|
@ -667,7 +680,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
|
||||
if (c.equals(value)) continue;
|
||||
Utility.fixDot();
|
||||
|
||||
|
||||
System.out.println("Mismatch: " + ucd.getCodeAndName(cp));
|
||||
System.out.println(" UCD Case Fold: <" + ucd.getCodeAndName(ucdFold) + ">");
|
||||
System.out.println(" IDN Map [" + reason + "]: <" + ucd.getCodeAndName(value) + ">");
|
||||
|
@ -679,7 +692,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
return errorCount;
|
||||
}
|
||||
|
||||
|
||||
static BitSet getIDNList(String file) throws IOException {
|
||||
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + file),32*1024);
|
||||
BitSet result = new BitSet();
|
||||
|
@ -693,14 +706,14 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
Utility.fixDot();
|
||||
System.out.println("//" + lineNumber + ": '" + line + "'");
|
||||
}
|
||||
|
||||
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
|
||||
int count = Utility.split(line,'-',parts);
|
||||
if (count > 2) throw new ChainException("Incorrect # of fields in IDN list", null);
|
||||
int start = Utility.codePointFromHex(parts[0]);
|
||||
int end = count == 1 ? start : Utility.codePointFromHex(parts[1]);
|
||||
|
||||
|
||||
for (int i = start; i <= end; ++i) {
|
||||
result.set(i);
|
||||
}
|
||||
|
@ -710,12 +723,12 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private static void IdentifierTest() {
|
||||
String x = normalize(UTF32.valueOf32(0x10300), 4) ;
|
||||
getCategoryID(x);
|
||||
|
||||
|
||||
/*
|
||||
Changes Category: U+10300 OLD ITALIC LETTER A
|
||||
nfx_cp: U+D800 <surrogate-D800>
|
||||
|
@ -724,7 +737,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
isIdentifierStart(cp, true): true
|
||||
cat(cp): Lo
|
||||
*/
|
||||
|
||||
|
||||
for (int j = 0; j < 5; ++j) {
|
||||
System.out.println();
|
||||
System.out.println("Testing Identifier Closure for " + NAMES[j]);
|
||||
|
@ -734,11 +747,11 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
if (!ucd.isAssigned(cp)) continue;
|
||||
if (ucd.isPUA(cp)) continue;
|
||||
if (!normalizationDiffers(cp, j)) continue;
|
||||
|
||||
|
||||
if (cp == 0xFDFB || cp == 0x0140) {
|
||||
System.out.println("debug point");
|
||||
}
|
||||
|
||||
|
||||
boolean norm;
|
||||
boolean plain;
|
||||
|
||||
|
@ -750,15 +763,15 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
Utility.fixDot();
|
||||
System.out.println("*Not Identifier: " + ucd.getCodeAndName(cp));
|
||||
System.out.println(" nfx_x_cp: " + ucd.getCodeAndName(nfx_x_cp));
|
||||
|
||||
|
||||
System.out.println(" isIdentifier(nfx_x_cp, true): " + norm);
|
||||
System.out.println(" cat(nfx_x_cp): " + getCategoryID(nfx_x_cp));
|
||||
|
||||
|
||||
System.out.println(" isIdentifier(x_cp, true): " + plain);
|
||||
System.out.println(" cat(x_cp): " + getCategoryID(x_cp));
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
String nfx_cp = normalize(UTF32.valueOf32(cp), j);
|
||||
plain = ucd.isIdentifierStart(cp, true);
|
||||
norm = ucd.isIdentifier(nfx_cp, true);
|
||||
|
@ -766,10 +779,10 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
Utility.fixDot();
|
||||
System.out.println(" Changes Category: " + ucd.getCodeAndName(cp));
|
||||
System.out.println(" nfx_cp: " + ucd.getCodeAndName(nfx_cp));
|
||||
|
||||
|
||||
System.out.println(" isIdentifier(nfx_cp, true): " + norm);
|
||||
System.out.println(" cat(nfx_cp): " + getCategoryID(nfx_cp));
|
||||
|
||||
|
||||
System.out.println(" isIdentifierStart(cp, true): " + plain);
|
||||
System.out.println(" cat(cp): " + ucd.getCategoryID(cp));
|
||||
System.out.println();
|
||||
|
@ -778,7 +791,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static String getCategoryID(String s) {
|
||||
if (UTF32.length32(s) == 1) return ucd.getCategoryID(UTF32.char32At(s, 0));
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -790,30 +803,30 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
static String normalize(String s, int j) {
|
||||
if (j < 4) return nf[j].normalize(s);
|
||||
return ucd.getCase(s, FULL, FOLD);
|
||||
}
|
||||
|
||||
|
||||
static boolean normalizationDiffers(int cp, int j) {
|
||||
if (j < 4) return nf[j].normalizationDiffers(cp);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Normalizers indexed by form: 0 = NFD, 1 = NFC, 2 = NFKD, 3 = NFKC.
// The slots are only filled in by initNormalizers(); until it has run they are null.
private static Normalizer[] nf = new Normalizer[4];
|
||||
private static Normalizer nfd, nfc, nfkd, nfkc;
|
||||
|
||||
|
||||
static void initNormalizers() {
|
||||
nfd = nf[0] = new Normalizer(Normalizer.NFD);
|
||||
nfc = nf[1] = new Normalizer(Normalizer.NFC);
|
||||
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
|
||||
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
|
||||
}
|
||||
|
||||
|
||||
private static UCD ucd;
|
||||
// Display names for the transformation index j used by IdentifierTest:
// entries 0-3 line up with the slots of nf[], entry 4 is the case-folding
// transformation that normalize() applies when j is not below 4.
private static final String[] NAMES = {"NFD", "NFC", "NFKD", "NFKC", "Fold"};
|
||||
|
||||
|
||||
private static void NFTest() {
|
||||
initNormalizers();
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
@ -834,10 +847,10 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
+ ", call: " + call + " " + ucd.getCodeAndName(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void checkScripts() {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
|
@ -847,21 +860,21 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void checkAgainstUInfo() {
|
||||
/*
|
||||
ucd = UCD.make(ucdVersion);
|
||||
UData x = new UData();
|
||||
x.fleshOut();
|
||||
|
||||
|
||||
System.out.println(ucd.toString(0x1E0A));
|
||||
|
||||
|
||||
UInfo.init();
|
||||
System.out.println("Cross-checking against old implementation");
|
||||
System.out.println("Version: " + ucd.getVersion() + ", " + new Date(ucd.getDate()));
|
||||
for (int i = 0; i <= 0xFFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
|
||||
|
||||
if ((i & 0x0FFF) == 0) System.out.println("#" + Utility.hex(i));
|
||||
try {
|
||||
check(i, ucd.getName(i), UInfo.getName((char)i), "Name");
|
||||
|
@ -872,12 +885,12 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
check(i, ucd.getDecompositionType(i), UInfo.getDecompositionType((char)i), UCD_Names.DT, "DecompositionType");
|
||||
check(i, ucd.getNumericValue(i), UInfo.getNumeric((char)i), "NumericValue");
|
||||
check(i, ucd.getNumericType(i), UInfo.getNumericType((char)i), UCD_Names.NT, "NumericType");
|
||||
|
||||
|
||||
check(i, ucd.getCase(i, SIMPLE, LOWER), UInfo.getLowercase((char)i), "SimpleLowercase");
|
||||
check(i, ucd.getCase(i, SIMPLE, UPPER), UInfo.getUppercase((char)i), "SimpleUppercase");
|
||||
check(i, ucd.getCase(i, SIMPLE, TITLE), UInfo.getTitlecase((char)i), "SimpleTitlecase");
|
||||
//check(i, ucd.getSimpleCaseFolding(i), UInfo.getSimpleCaseFolding((char)i));
|
||||
|
||||
|
||||
if (ucd.getSpecialCase(i).length() == 0) { // NORMAL
|
||||
check(i, ucd.getCase(i, FULL, LOWER), UInfo.toLowercase((char)i, ""), "FullLowercase");
|
||||
check(i, ucd.getCase(i, FULL, UPPER), UInfo.toUppercase((char)i, ""), "FullUppercase");
|
||||
|
@ -888,18 +901,18 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
check(i, ucd.getCase(i, SIMPLE, TITLE), UInfo.toTitlecase((char)i, ""), "FullTitlecase");
|
||||
}
|
||||
// check(i, ucd.getFullCaseFolding(i), UInfo.getFullCaseFolding((char)i));
|
||||
|
||||
|
||||
check(i, ucd.getSpecialCase(i).toUpperCase(), UInfo.getCaseCondition((char)i).toUpperCase(), "SpecialCase");
|
||||
check(i, ucd.getLineBreak(i), UInfo.getLineBreakType((char)i), UCD_Names.LB, "LineBreak");
|
||||
check(i, ucd.getEastAsianWidth(i), UInfo.getEastAsianWidthType((char)i), UCD_Names.EA, "EastAsian");
|
||||
|
||||
|
||||
int props = ucd.getBinaryProperties(i);
|
||||
check(i, (props>>BidiMirrored) & 1, UInfo.getMirrored((char)i), UCD_Names.YN_TABLE, "BidiMirroring");
|
||||
check(i, (props>>CompositionExclusion) & 1, UInfo.isCompositionExcluded((char)i)?1:0, UCD_Names.YN_TABLE, "Comp-Exclusion");
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
Utility.fixDot();
|
||||
|
||||
|
||||
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -907,38 +920,38 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
*/
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static void check(int cp, boolean x, boolean y, String[] names, String type) {
|
||||
check(cp, x ? 1 : 0, y ? 1 : 0, names, type);
|
||||
}
|
||||
|
||||
|
||||
public static void check(int cp, int x, int y, String[] names, String type) {
|
||||
if (x == y) return;
|
||||
showLast(cp);
|
||||
Utility.fixDot();
|
||||
System.out.println(" " + type + ": "
|
||||
+ Utility.getName(x, names) + " (" + x + ") " + " != "
|
||||
System.out.println(" " + type + ": "
|
||||
+ Utility.getName(x, names) + " (" + x + ") " + " != "
|
||||
+ Utility.getName(y, names) + " (" + y + ") ") ;
|
||||
}
|
||||
|
||||
|
||||
public static void check(int cp, int x, int y, String type) {
|
||||
if (x == y) return;
|
||||
showLast(cp);
|
||||
Utility.fixDot();
|
||||
System.out.println(" " + type + ": " + x + " != " + y) ;
|
||||
}
|
||||
|
||||
|
||||
public static void check(int cp, float x, float y, String type) {
|
||||
if (!(x > y) && !(x < y)) return; // funny syntax to catch NaN
|
||||
showLast(cp);
|
||||
Utility.fixDot();
|
||||
System.out.println(" " + type + ": " + x + " != " + y) ;
|
||||
}
|
||||
|
||||
|
||||
public static void check(int cp, String x, String y, String type) {
|
||||
if (x != null && x.equals(y)) return;
|
||||
if (x != null && y != null
|
||||
&& x.length() > 0 && y.length() > 0
|
||||
if (x != null && y != null
|
||||
&& x.length() > 0 && y.length() > 0
|
||||
&& x.charAt(0) == '<' && y.charAt(0) == '<') {
|
||||
if (x.startsWith("<unassigned") && y.equals("<reserved>")) return;
|
||||
if (y.equals("<control>")) return;
|
||||
|
@ -949,11 +962,11 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
Utility.fixDot();
|
||||
System.out.println(" " + type + ": " + Utility.quoteJavaString(x) + " != " + Utility.quoteJavaString(y));
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int lastShowed = -1;
|
||||
static boolean showCanonicalDecomposition = false;
|
||||
|
||||
|
||||
static void showLast(int cp) {
|
||||
if (lastShowed != cp) {
|
||||
Utility.fixDot();
|
||||
|
@ -967,14 +980,14 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
lastShowed = cp;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void test1() {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
|
||||
|
||||
for (int i = 0x19; i < 0x10FFFF; ++i) {
|
||||
|
||||
|
||||
System.out.println(Utility.hex(i) + " " + Utility.quoteJavaString(ucd.getName(i)));
|
||||
|
||||
|
||||
System.out.print(" "
|
||||
+ ", gc=" + ucd.getCategoryID(i)
|
||||
+ ", bc=" + ucd.getBidiClassID(i)
|
||||
|
@ -989,7 +1002,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
if (ucd.getBinaryProperty(i,j)) System.out.print(", " + UCD_Names.BP[j]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
|
||||
System.out.println(" "
|
||||
+ ", dm=" + Utility.quoteJavaString(ucd.getDecompositionMapping(i))
|
||||
+ ", slc=" + Utility.quoteJavaString(ucd.getCase(i, SIMPLE, LOWER))
|
||||
|
@ -1000,15 +1013,15 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
+ ", fuc=" + Utility.quoteJavaString(ucd.getCase(i, FULL, UPPER))
|
||||
+ ", sc=" + Utility.quoteJavaString(ucd.getSpecialCase(i))
|
||||
);
|
||||
|
||||
|
||||
if (i > 0x180) i = 3 * i / 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void checkCanonicalProperties() {
|
||||
ucd = UCD.make(ucdVersion);
|
||||
System.out.println(ucd.toString(0x1E0A));
|
||||
|
||||
|
||||
System.out.println("Cross-checking canonical equivalence");
|
||||
System.out.println("Version: " + ucd.getVersion() + ", " + new Date(ucd.getDate()));
|
||||
showCanonicalDecomposition = true;
|
||||
|
@ -1020,7 +1033,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
byte type = ucd.getDecompositionType(i);
|
||||
if (type != CANONICAL) continue;
|
||||
|
||||
|
||||
String s = ucd.getDecompositionMapping(i);
|
||||
int slen = UTF32.length32(s);
|
||||
int j = UTF32.char32At(s, 0);
|
||||
|
@ -1031,16 +1044,16 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
check(i, ucd.getBidiClass(i), ucd.getBidiClass(j), UCD_Names.BC, "BidiClass");
|
||||
check(i, ucd.getNumericValue(i), ucd.getNumericValue(j), "NumericValue");
|
||||
check(i, ucd.getNumericType(i), ucd.getNumericType(j), UCD_Names.NT, "NumericType");
|
||||
|
||||
|
||||
if (false) {
|
||||
for (byte k = LOWER; k <= FOLD; ++k) {
|
||||
check(i, ucd.getCase(i, SIMPLE, k), ucd.getCase(j, SIMPLE, k), "Simple("+k+")");
|
||||
check(i, ucd.getCase(i, FULL, k), ucd.getCase(j, FULL, k), "Full("+k+")");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (slen == 1) check(i, ucd.getSpecialCase(i), ucd.getSpecialCase(j), "SpecialCase");
|
||||
|
||||
|
||||
for (byte k = 0; k < LIMIT_BINARY_PROPERTIES; ++k) {
|
||||
if (k == Hex_Digit) continue;
|
||||
if (k == Radical) continue;
|
||||
|
@ -1052,12 +1065,12 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
//check(i, ucd.getLineBreak(i), ucd.getLineBreak(j), UCD_Names.LB, "LineBreak");
|
||||
//check(i, ucd.getEastAsianWidth(i), ucd.getEastAsianWidth(j), UCD_Names.EA, "EastAsian");
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -7,14 +20,14 @@ import com.ibm.text.utility.*;
|
|||
|
||||
public class WriteJavaScriptInfo {
|
||||
/* TODO: fix enumeration of compositions
|
||||
|
||||
|
||||
static public void writeJavascriptInfo() throws IOException {
|
||||
System.err.println("Writing Javascript data");
|
||||
UCD ucd = UCD.make();
|
||||
Normalizer normKD = new Normalizer(Normalizer.NFKD);
|
||||
Normalizer normD = new Normalizer(Normalizer.NFD);
|
||||
PrintWriter log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
|
||||
|
||||
|
||||
int count = 0;
|
||||
int datasize = 0;
|
||||
int max = 0;
|
||||
|
@ -22,7 +35,7 @@ public class WriteJavaScriptInfo {
|
|||
log.println("var KD = new Object(); // NFKD compatibility decomposition mappings");
|
||||
log.println("// NOTE: Hangul is done in code!");
|
||||
CompactShortArray csa = new CompactShortArray((short)0);
|
||||
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
|
||||
if (0xAC00 <= c && c <= 0xD7A3) continue;
|
||||
|
@ -50,7 +63,7 @@ public class WriteJavaScriptInfo {
|
|||
log.println("var D = new Object(); // NFD canonical decomposition mappings");
|
||||
log.println("// NOTE: Hangul is done in code!");
|
||||
csa = new CompactShortArray((short)0);
|
||||
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
|
||||
if (0xAC00 <= c && c <= 0xD7A3) continue;
|
||||
|
@ -64,7 +77,7 @@ public class WriteJavaScriptInfo {
|
|||
}
|
||||
}
|
||||
csa.compact();
|
||||
|
||||
|
||||
log.println("// " + count + " NFD mappings total");
|
||||
log.println("// " + datasize + " total characters of results");
|
||||
log.println("// " + max + " string length, maximum");
|
||||
|
@ -75,13 +88,13 @@ public class WriteJavaScriptInfo {
|
|||
datasize = 0;
|
||||
log.println("var CC = new Object(); // canonical class mappings");
|
||||
CompactByteArray cba = new CompactByteArray();
|
||||
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
|
||||
int canClass = normKD.getCanonicalClass(c);
|
||||
if (canClass != 0) {
|
||||
++count;
|
||||
|
||||
|
||||
log.println("\t CC[0x" + Utility.hex(c) + "]=" + canClass + ";");
|
||||
}
|
||||
}
|
||||
|
@ -89,7 +102,7 @@ public class WriteJavaScriptInfo {
|
|||
log.println("// " + count + " canonical class mappings total");
|
||||
log.println("// " + cba.storage() + " trie length");
|
||||
log.println();
|
||||
|
||||
|
||||
count = 0;
|
||||
datasize = 0;
|
||||
log.println("var C = new Object(); // composition mappings");
|
||||
|
@ -105,11 +118,11 @@ public class WriteJavaScriptInfo {
|
|||
}
|
||||
log.println("// " + count + " composition mappings total");
|
||||
log.println();
|
||||
|
||||
|
||||
log.close();
|
||||
System.err.println("Done writing Javascript data");
|
||||
}
|
||||
|
||||
|
||||
*/
|
||||
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue