Changed parameterization, fixed gaps, produced a raw version; everything is now in one file.

X-SVN-Rev: 14324
This commit is contained in:
Mark Davis 2004-01-15 01:08:30 +00:00
parent fcf9869ec8
commit 39e1d2518d
3 changed files with 246 additions and 161 deletions

com/ibm/text/UCA/Implicit.java

@ -1,5 +1,6 @@
package com.ibm.text.UCA;
import com.ibm.text.UCD.UCD_Types;
import com.ibm.text.utility.Utility;
/**
@ -16,7 +17,7 @@ import com.ibm.text.utility.Utility;
# Last CJK_A: E0DE3100
*/
public class Implicit {
public class Implicit implements UCD_Types {
/**
* constants
@ -27,7 +28,7 @@ public class Implicit {
static final long bottomByte = 0xFFL;
static final long fourBytes = 0xFFFFFFFFL;
static final int MAX_INPUT = 0x21FFFF;
static final int MAX_INPUT = 0x220000; // 2 * Unicode range + 1
/**
* Testing function
@ -37,6 +38,10 @@ public class Implicit {
System.out.println("Start");
try {
Implicit foo = new Implicit(0xE0, 0xE4);
//int x = foo.getRawImplicit(0xF810);
foo.getFromRawImplicit(0xE20303E7);
int gap4 = foo.getGap4();
int gap3 = foo.getGap3();
int minTrail = foo.getMinTrail();
@ -44,13 +49,30 @@ public class Implicit {
long last = 0;
long current;
for (int i = 0; i <= MAX_INPUT; ++i) {
current = foo.getImplicit(i) & fourBytes;
current = foo.getRawImplicit(i) & fourBytes;
// check that it round-trips AND that all intervening ones are illegal
int roundtrip = foo.getFromRawImplicit((int)current);
if (roundtrip != i) {
foo.throwError("No roundtrip", i);
}
if (last != 0) {
for (long j = last + 1; j < current; ++j) {
roundtrip = foo.getFromRawImplicit((int)j);
// raise an error if it *doesn't* find an error
if (roundtrip != -1) {
foo.throwError("Fails to recognize illegal", j);
}
}
}
// now do other consistency checks
long lastBottom = last & bottomByte;
long currentBottom = current & bottomByte;
long lastTop = last & topByte;
long currentTop = current & topByte;
// do some consistency checks
/*
long gap = current - last;
if (currentBottom != 0) { // if we are a 4-byte
// gap has to be at least gap4
@ -65,6 +87,7 @@ public class Implicit {
if (current3Bottom < minTrail + gap3) foo.throwError("Failed gap3 before", i);
if (current3Bottom > maxTrail - gap3) foo.throwError("Failed gap3 after", i);
}
*/
// print out some values for spot-checking
if (lastTop != currentTop || i == 0x10000 || i == 0x110000) {
foo.show(i-3);
@ -94,13 +117,17 @@ public class Implicit {
}
}
private void throwError(String title, int i) {
throw new IllegalArgumentException(title + "\t" + Utility.hex(i) + "\t" + Utility.hex(getImplicit(i) & fourBytes));
private void throwError(String title, int cp) {
throw new IllegalArgumentException(title + "\t" + Utility.hex(cp) + "\t" + Utility.hex(getRawImplicit(cp) & fourBytes));
}
private void throwError(String title, long ce) {
throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
}
private void show(int i) {
if (i >= 0 && i <= MAX_INPUT) {
System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicit(i) & fourBytes));
System.out.println(Utility.hex(i) + "\t" + Utility.hex(getRawImplicit(i) & fourBytes));
}
}
@ -117,6 +144,8 @@ public class Implicit {
int max4Primary;
int minTrail;
int maxTrail;
int max3Trail;
int max4Trail;
int min4Boundary;
public int getGap4() {
@ -140,7 +169,7 @@ public class Implicit {
*/
public Implicit(int minPrimary, int maxPrimary) {
// 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 15);
this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1);
}
/**
@ -152,54 +181,54 @@ public class Implicit {
* @param gap3 the gap we leave for tailoring for 3-byte forms
* @param gap4 the gap we leave for tailoring for 4-byte forms
*/
public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int gap4) {
public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
// some simple parameter checks
if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
if (gap3 < 1 || gap4 < 1) throw new IllegalArgumentException("must have larger gaps");
if (primaries3count < 1) throw new IllegalArgumentException("bad gap");
this.minTrail = minTrail;
this.maxTrail = maxTrail;
final3Multiplier = gap3 + 1;
final4Multiplier = gap4 + 1;
min3Primary = minPrimary;
max4Primary = maxPrimary;
// compute constants for use later.
// number of values we can use in trailing bytes
// leave room for empty values below, between, AND above, so
// gap = 2:
// range 3..7 => (3,4) 5 (6,7): so 1 value
// range 3..8 => (3,4) 5 (6,7,8): so 1 value
// range 3..9 => (3,4) 5 (6,7,8,9): so 1 value
// range 3..10 => (3,4) 5 (6,7) 8 (9, 10): so 2 values
final3Count = 1 + (maxTrail - minTrail - 1) / final3Multiplier;
final4Count = 1 + (maxTrail - minTrail - 1) / final4Multiplier;
// leave room for empty values between AND above, e.g. if gap = 2
// range 3..7 => +3 -4 -5 -6 -7: so 1 value
// range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
// range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
final3Multiplier = gap3 + 1;
final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
// medials can use full range
medialCount = (maxTrail - minTrail + 1);
// find out how many values fit in each form
int fourByteCount = medialCount * medialCount * final4Count;
int threeByteCount = medialCount * final3Count;
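// [Editor's illustration, not part of the commit] Plugging the example values from the
// comment above (minTrail = 3, maxTrail = 8, gap3 = 2) into these formulas:
//   final3Multiplier = gap3 + 1          // = 3
//   final3Count      = (8 - 3 + 1) / 3   // = 2, matching "+3 -4 -5 +6 -7 -8: so 2 values"
//   max3Trail        = 3 + (2 - 1) * 3   // = 6, the last usable 3-byte final trail
//   medialCount      = 8 - 3 + 1         // = 6, medial bytes use the full range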
// now determine where the 3/4 boundary is.
// we use 3 bytes below the boundary, and 4 above
int primariesAvailable = maxPrimary - minPrimary + 1;
int min4BytesNeeded = divideAndRoundUp(MAX_INPUT, fourByteCount);
int min3BytesNeeded = primariesAvailable - min4BytesNeeded;
if (min3BytesNeeded < 1) throw new IllegalArgumentException("Too few 3-byte implicits available.");
int min3ByteCoverage = min3BytesNeeded * threeByteCount;
min4Primary = minPrimary + min3BytesNeeded;
int primaries4count = primariesAvailable - primaries3count;
//int min3BytesNeeded = primariesAvailable - min4BytesNeeded;
int min3ByteCoverage = primaries3count * threeByteCount;
min4Primary = minPrimary + primaries3count;
min4Boundary = min3ByteCoverage;
// Now expand out the multiplier for the 4 bytes, and redo.
int totalNeeded = MAX_INPUT - min4Boundary;
int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, min4BytesNeeded);
int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
int expandedGap = (maxTrail - minTrail - 1) / (neededPerFinalByte + 1) - 1;
if (DEBUG) System.out.println("expandedGap: " + expandedGap);
if (expandedGap < gap4) throw new IllegalArgumentException("must have larger gaps");
final4Multiplier = expandedGap + 1;
int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
if (DEBUG) System.out.println("expandedGap: " + gap4);
if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
final4Multiplier = gap4 + 1;
final4Count = neededPerFinalByte;
max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
if (DEBUG) {
System.out.println("final4Count: " + final4Count);
for (int counter = 0; counter <= final4Count; ++counter) {
@ -212,13 +241,58 @@ public class Implicit {
static public int divideAndRoundUp(int a, int b) {
return 1 + (a-1)/b;
}
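// [Editor's note, not part of the commit] For positive a and b this is ordinary ceiling
// division, which is how it is used above to spread the MAX_INPUT values over the
// available primaries, e.g.
//   divideAndRoundUp(10, 5) == 2   // exact division
//   divideAndRoundUp(11, 5) == 3   // rounds up
//   divideAndRoundUp(1, 5)  == 1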
/**
* Converts implicit CE into raw integer ("code point")
* @param implicit
* @return -1 if illegal format
*/
public int getFromRawImplicit(int implicit) {
int result;
int b3 = implicit & 0xFF;
implicit >>= 8;
int b2 = implicit & 0xFF;
implicit >>= 8;
int b1 = implicit & 0xFF;
implicit >>= 8;
int b0 = implicit & 0xFF;
// simple parameter checks
if (b0 < min3Primary || b0 > max4Primary
|| b1 < minTrail || b1 > maxTrail) return -1;
// normal offsets
b1 -= minTrail;
// take care of the final values, and compose
if (b0 < min4Primary) {
if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
b2 -= minTrail;
int remainder = b2 % final3Multiplier;
if (remainder != 0) return -1;
b0 -= min3Primary;
b2 /= final3Multiplier;
result = ((b0 * medialCount) + b1) * final3Count + b2;
} else {
if (b2 < minTrail || b2 > maxTrail
|| b3 < minTrail || b3 > max4Trail) return -1;
b2 -= minTrail;
b3 -= minTrail;
int remainder = b3 % final4Multiplier;
if (remainder != 0) return -1;
b3 /= final4Multiplier;
b0 -= min4Primary;
result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
}
// final check
if (result < 0 || result > MAX_INPUT) return -1;
return result;
}
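// [Editor's sketch, hypothetical usage mirroring the consistency test in main() above]
//   Implicit imp = new Implicit(0xE0, 0xE4);
//   int ce = imp.getRawImplicit(0x4E00);          // encode a raw "code point"
//   assert imp.getFromRawImplicit(ce) == 0x4E00;  // decoding round-trips
//   // and, per the test above, values strictly between two consecutive implicits
//   // are expected to decode to -1 (illegal format).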
/**
* Generate the implicit CE, left shifted to put the first byte at the top of an int.
* @param cp code point
* @return
*/
public int getImplicit(int cp) {
public int getRawImplicit(int cp) {
if (cp < 0 || cp > MAX_INPUT) {
throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
}
@ -230,7 +304,7 @@ public class Implicit {
int last2 = last1 / medialCount;
last1 %= medialCount;
last0 = minTrail + (last0 + 1)*final3Multiplier - 1; // spread out, leaving gap at start
last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
last1 = minTrail + last1; // offset
last2 = min3Primary + last2; // offset
@ -249,7 +323,7 @@ public class Implicit {
int last3 = last2 / medialCount;
last2 %= medialCount;
last0 = minTrail + (last0 + 1)*final4Multiplier - 1; // spread out, leaving gap at start
last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
last1 = minTrail + last1; // offset
last2 = minTrail + last2; // offset
last3 = min4Primary + last3; // offset
@ -261,6 +335,71 @@ public class Implicit {
return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
}
}
public int getSwappedImplicit(int cp) {
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
cp = Implicit.swapCJK(cp);
// we now have a range of numbers from 0 to 21FFFF.
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
return getRawImplicit(cp);
}
/**
* Function used to:
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
* b) bump any non-CJK characters by 10FFFF.
* The relevant blocks are:
* A: 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B: 3400..4DBF; CJK Unified Ideographs Extension A
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* The reordered blocks are:
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
* (all contiguous)
* Any other CJK gets its normal code point
* Any non-CJK gets +10FFFF
* When we reorder Block1, we make sure that it is at the very start,
* so that it will use a 3-byte form.
* Warning: we only pick up the compatibility characters that are
* NOT decomposed, so that block is smaller!
*/
static int NON_CJK_OFFSET = 0x110000;
static int swapCJK(int i) {
if (i >= CJK_BASE) {
if (i < CJK_LIMIT) return i - CJK_BASE;
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
+ (CJK_LIMIT - CJK_BASE);
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
return i + NON_CJK_OFFSET; // non-CJK
}
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
+ (CJK_LIMIT - CJK_BASE)
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
return i + NON_CJK_OFFSET; // non-CJK
}
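// [Editor's illustration, not part of the commit; derived directly from the branches above]
//   swapCJK(CJK_BASE)             == 0                        // URO maps to the very start (3-byte form)
//   swapCJK(CJK_COMPAT_USED_BASE) == CJK_LIMIT - CJK_BASE     // compat ideographs follow the URO
//   swapCJK(CJK_A_BASE)           == (CJK_LIMIT - CJK_BASE)
//                                    + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)  // Extension A comes third
//   swapCJK('A')                  == 'A' + NON_CJK_OFFSET     // non-CJK is bumped past all CJK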
/**
* @return
*/

com/ibm/text/UCA/Main.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2004/01/13 18:32:12 $
* $Revision: 1.17 $
* $Date: 2004/01/15 01:08:30 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -33,6 +33,7 @@ public class Main {
// A few changes would need to be made to the code to do older versions.
try {
System.out.println("Building UCA");
Default.setUCD(UCDVersion);
WriteCollationData.collator = new UCA(null, UCDVersion);
System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
+ "/ucd: " + WriteCollationData.collator.getUCDVersion());

com/ibm/text/UCA/WriteCollationData.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2004/01/13 18:32:11 $
* $Revision: 1.36 $
* $Date: 2004/01/15 01:08:30 $
* $Revision: 1.37 $
*
*******************************************************************************
*/
@ -109,15 +109,15 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
static public void writeCaseExceptions() {
System.err.println("Writing Case Exceptions");
Normalizer NFKC = new Normalizer(Normalizer.NFKC, UNICODE_VERSION);
//Normalizer NFKC = new Normalizer(Normalizer.NFKC, UNICODE_VERSION);
for (char a = 0; a < 0xFFFF; ++a) {
if (!ucd.isRepresented(a)) continue;
//if (0xA000 <= a && a <= 0xA48F) continue; // skip YI
String b = Case.fold(a);
String c = NFKC.normalize(b);
String c = Default.nfkc.normalize(b);
String d = Case.fold(c);
String e = NFKC.normalize(d);
String e = Default.nfkc.normalize(d);
if (!e.equals(c)) {
System.out.println(Utility.hex(a) + "; " + Utility.hex(d, " ") + " # " + ucd.getName(a));
/*
@ -135,7 +135,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
*/
}
String f = Case.fold(e);
String g = NFKC.normalize(f);
String g = Default.nfkc.normalize(f);
if (!f.equals(d) || !g.equals(e)) System.out.println("!!!!!!SKY IS FALLING!!!!!!");
}
}
@ -187,8 +187,8 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
static public void writeJavascriptInfo() throws IOException {
System.err.println("Writing Javascript data");
Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
//Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
log = Utility.openPrintWriter(UCA_GEN_DIR, "Normalization_data.js", Utility.LATIN1_WINDOWS);
@ -204,9 +204,9 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (!normKD.isNormalized(c)) {
if (!Default.nfkd.isNormalized(c)) {
++count;
String decomp = normKD.normalize(c);
String decomp = Default.nfkd.normalize(c);
datasize += decomp.length();
if (max < decomp.length()) max = decomp.length();
if (decomp.length() > 7) ++over7;
@ -232,9 +232,9 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
if (0xAC00 <= c && c <= 0xD7A3) continue;
if (!normD.isNormalized(c)) {
if (!Default.nfd.isNormalized(c)) {
++count;
String decomp = normD.normalize(c);
String decomp = Default.nfd.normalize(c);
datasize += decomp.length();
if (max < decomp.length()) max = decomp.length();
csa.setElementAt(c, (short)count);
@ -256,7 +256,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
for (char c = 0; c < 0xFFFF; ++c) {
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
int canClass = normKD.getCanonicalClass(c);
int canClass = Default.nfkd.getCanonicalClass(c);
if (canClass != 0) {
++count;
@ -277,7 +277,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
/*
IntHashtable.IntEnumeration enum = normKD.getComposition();
IntHashtable.IntEnumeration enum = Default.nfkd.getComposition();
while (enum.hasNext()) {
int key = enum.next();
char val = (char) enum.value();
@ -530,8 +530,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}
int oldStrength = collator.getStrength();
collator.setStrength(strength);
Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
//Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
//Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
switch (strength) {
case 1: log.println("<h2>3. Primaries Incompatible with Decompositions</h2>"); break;
case 2: log.println("<h2>4. Secondaries Incompatible with Decompositions</h2>"); break;
@ -550,12 +550,12 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
for (int ch = 0; ch < 0x10FFFF; ++ch) {
if (!ucd_uca_base.isAllocated(ch)) continue;
if (nfkd.isNormalized(ch)) continue;
if (Default.nfkd.isNormalized(ch)) continue;
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
if (alreadySeen.contains(ch)) continue;
Utility.dot(ch);
String decomp = nfkd.normalize(ch);
String decomp = Default.nfkd.normalize(ch);
if (ch != ' ' && decomp.charAt(0) == ' ') {
skipSet.add(ch);
continue; // skip weird decomps
@ -608,10 +608,10 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}
static String remapSortKey(int cp, boolean decomposition) {
if (toD.isNormalized(cp)) return remapCanSortKey(cp, decomposition);
if (Default.nfd.isNormalized(cp)) return remapCanSortKey(cp, decomposition);
// we know that it is not NFKD.
String canDecomp = toD.normalize(cp);
String canDecomp = Default.nfd.normalize(cp);
String result = "";
int ch;
for (int j = 0; j < canDecomp.length(); j += UTF16.getCharCount(ch)) {
@ -799,9 +799,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
log.println("compressed: " + comp);
}
log.println("Ken's : " + kenStr);
String nfkd = NFKD.normalize(s);
String nfkd = Default.nfkd.normalize(s);
log.println("NFKD : " + ucd.getCodeAndName(nfkd));
String nfd = NFD.normalize(s);
String nfd = Default.nfd.normalize(s);
if (!nfd.equals(nfkd)) {
log.println("NFD : " + ucd.getCodeAndName(nfd));
}
@ -824,7 +824,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
static final byte getDecompType(int cp) {
byte result = ucd.getDecompositionType(cp);
if (result == ucd.CANONICAL) {
String d = NFD.normalize(cp); // TODO
String d = Default.nfd.normalize(cp); // TODO
int cp1;
for (int i = 0; i < d.length(); i += UTF16.getCharCount(cp1)) {
cp1 = UTF16.charAt(d, i);
@ -887,7 +887,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
byte type = getDecompType(UTF16.charAt(s, 0));
char ch = s.charAt(0);
String decomp = NFKD.normalize(s);
String decomp = Default.nfkd.normalize(s);
int len = 0;
int markLen = collator.getCEs(decomp, true, markCes);
if (compress) markLen = kenCompress(markCes, markLen);
@ -957,7 +957,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
log.println("<h2>8. Checking against categories</h2>");
log.println("<p>These are not necessarily errors, but should be examined for <i>possible</i> errors</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
Set sorted = new TreeSet();
@ -994,14 +994,14 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
log.println("<p>These are not necessarily errors, but should be examined for <i>possible</i> errors</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, toD);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
Map map = new TreeMap();
while (true) {
String s = cc.next();
if (s == null) break;
if (!toD.isNormalized(s)) continue; // only unnormalized stuff
if (!Default.nfd.isNormalized(s)) continue; // only unnormalized stuff
if (UTF16.countCodePoint(s) == 1) {
int cat = ucd.getCategory(UTF16.charAt(s,0));
if (cat == Cn || cat == Cc || cat == Cs) continue;
@ -1033,7 +1033,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
log.println("<p>These are not necessarily errors, but should be examined for <i>possible</i> errors</p>");
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, toD);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
Map map = new TreeMap();
Map tails = new TreeMap();
@ -1045,7 +1045,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
String s = cc.next();
if (s == null) break;
Utility.dot(counter++);
if (!toD.isNormalized(s)) continue; // only normalized stuff
if (!Default.nfd.isNormalized(s)) continue; // only normalized stuff
CEList celist = collator.getCEList(s, true);
map.put(celist, s);
}
@ -1212,11 +1212,11 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
diLog.write('\uFEFF');
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
int[] ces = new int[50];
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
int[] lenArray = new int[1];
diLog.println("# Contractions");
@ -1261,7 +1261,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
//diLog = new PrintWriter(new FileOutputStream(UCA_GEN_DIR + "DisjointIgnorables.txt"));
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
int[] ces = new int[50];
int[] secondariesZP = new int[400];
@ -1287,7 +1287,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@ -1432,7 +1432,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
//diLog = new PrintWriter(new FileOutputStream(UCA_GEN_DIR + "DisjointIgnorables.txt"));
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
int[] ces = new int[50];
int[] secondariesZP = new int[400];
@ -1458,7 +1458,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@ -1628,9 +1628,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}
}
static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
static Normalizer NFC = new Normalizer(Normalizer.NFC, "");
static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
//static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
//static Normalizer NFC = new Normalizer(Normalizer.NFC, "");
//static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
static int getFirstCELen(int[] ces, int len) {
if (len < 2) return len;
@ -1653,8 +1653,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
//if (true) return;
int[] ces = new int[50];
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
if (false) {
int len2 = collator.getCEs("\u2474", true, ces);
@ -1671,7 +1671,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
Map ordered = new TreeMap(cm);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE,
SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null);
SKIP_CANONICAL_DECOMPOSIBLES ? Default.nfd : null);
int[] lenArray = new int[1];
Set alreadyDone = new HashSet();
@ -1737,7 +1737,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
UnicodeSet composites = new UnicodeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAllocated(i)) continue;
if (nfd.isNormalized(i)) continue;
if (Default.nfd.isNormalized(i)) continue;
composites.add(i);
}
UnicodeSet CJKcomposites = new UnicodeSet(CJK).retainAll(composites);
@ -1774,9 +1774,9 @@ F900..FAFF; CJK Compatibility Ideographs
System.out.println("Adding Kanji");
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAllocated(i)) continue;
if (nfkd.isNormalized(i)) continue;
if (Default.nfkd.isNormalized(i)) continue;
Utility.dot(i);
String decomp = nfkd.normalize(i);
String decomp = Default.nfkd.normalize(i);
int cp;
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(decomp, j);
@ -2438,7 +2438,7 @@ F900..FAFF; CJK Compatibility Ideographs
System.out.println("Fix Homeless! No back map for " + CEList.toString(ces[i])
+ " from " + CEList.toString(ces, len));
System.out.println("\t" + ucd.getCodeAndName(chr)
+ " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
+ " => " + ucd.getCodeAndName(Default.nfkd.normalize(chr))
);
s = "[" + Utility.hex(ces[i]) + "]";
} while (false); // exactly one time, just for breaking
@ -2528,7 +2528,7 @@ F900..FAFF; CJK Compatibility Ideographs
"[[:whitespace:][:c:][:z:][[:ascii:]-[a-zA-Z0-9]]]");
// needsQuoting.remove();
}
s = NFC.normalize(s);
s = Default.nfc.normalize(s);
quoteOperandBuffer.setLength(0);
boolean noQuotes = true;
boolean inQuote = false;
@ -2618,8 +2618,8 @@ F900..FAFF; CJK Compatibility Ideographs
|| primary > oldJamo5 && primary <= oldJamo6;
}
static Normalizer NFKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
static Normalizer NFD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//static Normalizer NFKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
//static Normalizer NFD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
static int variableHigh = 0;
static final int COMMON = 5;
@ -2760,7 +2760,7 @@ F900..FAFF; CJK Compatibility Ideographs
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isNoncharacter(i)) {
if (!ucd.isAllocated(i)) continue;
if (NFD.isNormalized(i)) continue;
if (Default.nfd.isNormalized(i)) continue;
if (ucd.isHangulSyllable(i)) continue;
//if (collator.getCEType(i) >= UCA.FIXED_CE) continue;
}
@ -2795,7 +2795,7 @@ F900..FAFF; CJK Compatibility Ideographs
// Skip anything that is not FCD.
if (!NFD.isFCD(s)) continue;
if (!Default.nfd.isFCD(s)) continue;
// We ONLY add if the sort key would be different
// Than what we would get if we didn't decompose!!
@ -3462,59 +3462,13 @@ F900..FAFF; CJK Compatibility Ideographs
}
*/
/**
* Function used to:
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
* b) bump any non-CJK characters by 10FFFF.
* The relevant blocks are:
* A: 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B: 3400..4DBF; CJK Unified Ideographs Extension A
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* The reordered blocks are:
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
* Any other CJK gets its normal code point
* Any non-CJK gets +10FFFF
* When we reorder Block1, we make sure that it is at the very start,
* so that it will use a 3-byte form.
*/
static int swapCJK(int i) {
if (i >= CJK_BASE) {
if (i < CJK_LIMIT) return i - CJK_BASE;
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
+ (CJK_LIMIT - CJK_BASE);
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
return i + NON_CJK_OFFSET; // non-CJK
}
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
+ (CJK_LIMIT - CJK_BASE)
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
return i + NON_CJK_OFFSET; // non-CJK
}
// Fractional UCA Generation Constants
static final int
TOP = 0xA0,
SPECIAL_BASE = 0xF0,
NON_CJK_OFFSET = 0x110000,
BYTES_TO_AVOID = 3,
BYTES_TO_AVOID = 3,
OTHER_COUNT = 256 - BYTES_TO_AVOID,
LAST_COUNT = OTHER_COUNT / 2,
LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes
@ -3533,23 +3487,14 @@ static int swapCJK(int i) {
// GET IMPLICIT PRIMARY WEIGHTS
// Return value is left justified primary key
static int getImplicitPrimary(int cp) {
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
cp = swapCJK(cp);
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
// we now have a range of numbers from 0 to 21FFFF.
return getImplicitPrimaryFromSwapped(cp);
}
static Implicit implicit = new Implicit(IMPLICIT_BASE_BYTE, IMPLICIT_MAX_BYTE);
static int getImplicitPrimary(int cp) {
return implicit.getSwappedImplicit(cp);
}
static int getImplicitPrimaryFromSwapped(int cp) {
return implicit.getImplicit(cp);
return implicit.getRawImplicit(cp);
}
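// [Editor's note, hypothetical usage] A call such as
//   int primary = getImplicitPrimary(0x4E00);
// resolves to implicit.getSwappedImplicit(0x4E00), i.e. Implicit.swapCJK() followed by
// Implicit.getRawImplicit(); all implicit-weight logic now lives in Implicit.java (diff above).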
@ -3563,7 +3508,7 @@ static int swapCJK(int i) {
static void showImplicit2(String title, int cp) {
System.out.println(title + ":\t" + Utility.hex(cp)
+ " => " + Utility.hex(swapCJK(cp))
+ " => " + Utility.hex(Implicit.swapCJK(cp))
+ " => " + Utility.hex(INT_MASK & getImplicitPrimary(cp)));
}
@ -3627,7 +3572,7 @@ static int swapCJK(int i) {
// test swapping
int currSwap = swapCJK(i);
int currSwap = Implicit.swapCJK(i);
if (currSwap < oldSwap) {
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+ Utility.hex(oldChar) + " (" + Utility.hex(oldSwap) + ")"
@ -3686,7 +3631,7 @@ static int swapCJK(int i) {
// b. toSmallKana(NFKD(x)) != x.
static final boolean needsCaseBit(String x) {
String s = NFKD.normalize(x);
String s = Default.nfkd.normalize(x);
if (!ucd.getCase(s, FULL, LOWER).equals(s)) return true;
if (!toSmallKana(s).equals(s)) return true;
return false;
@ -4175,7 +4120,7 @@ static int swapCJK(int i) {
continue;
}
canIt.setSource(key);
String nfdKey = toD.normalize(key);
String nfdKey = Default.nfd.normalize(key);
boolean first = true;
while (true) {
@ -4187,7 +4132,7 @@ static int swapCJK(int i) {
// Skip anything that is not FCD.
if (!NFD.isFCD(s)) continue;
if (!Default.nfd.isFCD(s)) continue;
// We ONLY add if the sort key would be different
// Than what we would get if we didn't decompose!!
@ -4235,11 +4180,11 @@ static int swapCJK(int i) {
errorCount++;
}
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
int[] ces = new int[50];
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
int[] lenArray = new int[1];
int minps = Integer.MAX_VALUE;
@ -4275,7 +4220,7 @@ static int swapCJK(int i) {
}
}
cc = collator.getContents(UCA.FIXED_CE, nfd);
cc = collator.getContents(UCA.FIXED_CE, Default.nfd);
log.println("<table border='1' cellspacing='0' cellpadding='2'>");
int lastPrimary = 0;
@ -4410,8 +4355,8 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
static final char MARK2 = '\u0002';
//Normalizer normalizer = new Normalizer(Normalizer.NFC, true);
static Normalizer toC = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
static Normalizer toD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
//static Normalizer toC = new Normalizer(Normalizer.NFC, UNICODE_VERSION);
//static Normalizer toD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
static TreeMap MismatchedC = new TreeMap();
static TreeMap MismatchedN = new TreeMap();
static TreeMap MismatchedD = new TreeMap();
@ -4425,7 +4370,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
static void addString(String ch, byte option) {
String colDbase = collator.getSortKey(ch, option, true);
String colNbase = collator.getSortKey(ch, option, false);
String colCbase = collator.getSortKey(toC.normalize(ch), option, false);
String colCbase = collator.getSortKey(Default.nfc.normalize(ch), option, false);
if (!colNbase.equals(colCbase) || !colNbase.equals(colDbase) ) {
/*System.out.println(Utility.hex(ch));
System.out.println(printableKey(colNbase));
@ -4595,7 +4540,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
}
static void showLine(int count, String ch, String keyD, String keyN) {
String decomp = toD.normalize(ch);
String decomp = Default.nfd.normalize(ch);
if (decomp.equals(ch)) decomp = ""; else decomp = "<br><" + Utility.hex(decomp, " ") + "> ";
log.println("<tr><td>" + count + "</td><td>"
+ Utility.hex(ch, " ")
@ -4631,8 +4576,8 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
String MN = (String)MismatchedN.get(ch);
String MC = (String)MismatchedC.get(ch);
String MD = (String)MismatchedD.get(ch);
String chInC = toC.normalize(ch);
String chInD = toD.normalize(ch);
String chInC = Default.nfc.normalize(ch);
String chInD = Default.nfd.normalize(ch);
log.println("<tr><td rowSpan='3' class='bottom'>" + Utility.replace(ucd.getName(ch), ", ", ",<br>")
+ "</td><td>NFD</td><td>" + Utility.hex(chInD)
@ -4665,7 +4610,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
static void showDiff(boolean showName, boolean firstColumn, int line, Object chobj) {
String ch = chobj.toString();
String decomp = toD.normalize(ch);
String decomp = Default.nfd.normalize(ch);
if (showName) {
if (ch.equals(decomp)) {
log.println(//title + counter + " "