Finally did some significant code cleanup on collation. Not enough, but it's a start.

X-SVN-Rev: 8896
This commit is contained in:
Mark Davis 2002-06-15 02:47:14 +00:00
parent 05d2989deb
commit 3940ed8c00
11 changed files with 522 additions and 627 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.4 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -165,6 +165,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
return result.toString();
}
/**
 * Formats a stack of collation elements as one line of text.
 * Each element is rendered with toString(int); neighbouring elements are
 * separated by a single space, and an empty stack produces an empty string.
 * @param ces collation elements to format, read in index order
 * @return the space-separated rendering of every element in ces
 */
public static String toString(IntStack ces) {
    StringBuffer text = new StringBuffer();
    int position = 0;
    while (position < ces.length()) {
        // separator goes before every element except the first
        if (position > 0) {
            text.append(' ');
        }
        text.append(toString(ces.get(position)));
        position++;
    }
    return text.toString();
}
public static String toString(int ce) {
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
+ Utility.hex(UCA.getSecondary(ce)) + "."

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2002/06/04 01:59:01 $
* $Revision: 1.5 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -18,7 +18,8 @@ import com.ibm.text.utility.*;
public class Main {
static final String UCDVersion = "";
static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", "writeconformance", "writeconformanceshifted",
static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA",
"writeconformance", "writeconformanceshifted",
"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
public static void main(String args[]) throws Exception {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2002/06/04 01:58:56 $
* $Revision: 1.13 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -24,6 +24,7 @@ import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
//import com.ibm.text.CollationData.*;
@ -62,7 +63,7 @@ This is because of shared
characters between scripts with different directions, like French with Arabic or Greek.
*/
final public class UCA implements Comparator {
final public class UCA implements Comparator, UCA_Types {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
@ -85,19 +86,13 @@ final public class UCA implements Comparator {
// base directory will change depending on the installation
public static final String BASE_DIR = "c:\\DATA\\";
/** Enum for alternate handling */
public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3;
/**
* Used to terminate a list of CEs
*/
public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string
// =============================================================
// Test Settings
// =============================================================
static final boolean DEBUG = false;
static final boolean DEBUG_SHOW_LINE = false;
static final boolean SHOW_STATS = true;
static final boolean SHOW_CE = false;
@ -109,6 +104,7 @@ final public class UCA implements Comparator {
static final boolean RECORDING_CHARS = true;
private UCD ucd;
private UCA_Data ucaData;
// =============================================================
// Main Methods
@ -121,11 +117,7 @@ final public class UCA implements Comparator {
*/
public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
fullData = source == null;
// clear some tables
for (int i = 0; i < collationElements.length; ++i) {
collationElements[i] = UNSUPPORTED;
}
// load the normalizer
if (toD == null) {
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
@ -134,6 +126,8 @@ final public class UCA implements Comparator {
ucd = UCD.make(unicodeVersion);
ucdVersion = ucd.getVersion();
ucaData = new UCA_Data(toD, ucd);
// either get the full sources, or just a demo set
if (fullData) {
for (int i = 0; i < KEYS.length; ++i) {
@ -234,7 +228,7 @@ final public class UCA implements Comparator {
}
if (SHOW_CE) {
if (debugList.length() != 0) debugList.append("/");
debugList.append(ceToString(ce));
debugList.append(CEList.toString(ce));
}
// add weights
@ -412,6 +406,35 @@ final public class UCA implements Comparator {
return target;
}
/**
* Collects the non-zero collation elements (CEs) for an entire string.
* The string is copied into the decomposition buffer (passed through toD.normalize
* first when decomposition is requested), and CEs are then read with getCE()
* until it returns TERMINATOR.
* @param sourceString string whose collation elements are wanted.
* @param decomposition true for UCA, false where the text is guaranteed to be
* normalization form C with no combining marks of class 0.
* @param output stack that receives the CEs, pushed in the order they are produced.
* It is not cleared first, CEs equal to zero are skipped, and TERMINATOR itself is not pushed.
*/
public void getCEs(String sourceString, boolean decomposition, IntStack output) {
// start from an empty working buffer
decompositionBuffer.setLength(0);
if (decomposition) {
toD.normalize(sourceString, decompositionBuffer);
} else {
decompositionBuffer.append(sourceString);
}
rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai)
// restart reading at the beginning of the buffer
index = 0;
// pull CEs one at a time until getCE() signals the end
while (true) {
//fixQuaternatiesPosition = quaternaries.length();
int ce = getCE();
if (ce == 0) continue; // zero CEs are dropped, not stored
if (ce == TERMINATOR) break; // end marker is consumed here and not pushed
output.push(ce);
}
}
/**
* Returns a list of CEs for a unicode character at a position.
* @param sourceString string to make a sort key for.
@ -477,14 +500,6 @@ final public class UCA implements Comparator {
return strength == 1 ? primarySet : strength == 2 ? secondarySet : tertiarySet;
}
/**
* CE Type
*/
static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2,
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
FIXED_CE = 3;
// SURROGATE_CE = 6,
/**
* Returns the char associated with a FIXED value
*/
@ -497,28 +512,7 @@ final public class UCA implements Comparator {
* Return the type of the CE
*/
public byte getCEType(int ch) {
if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands
int ce = collationElements[ch];
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return NORMAL_CE;
if (ce == UNSUPPORTED) {
// Special check for Han, Hangul
if (isHangul(ch)) return HANGUL_CE;
if (isCJK(ch)) return CJK_CE;
if (isCJK_AB(ch)) return CJK_AB_CE;
// special check for unsupported surrogate pair, 20 1/8 bits
//if (0xD800 <= ch && ch <= 0xDFFF) {
// return SURROGATE_CE;
//}
return UNSUPPORTED_CE;
}
if (ce == CONTRACTING) return CONTRACTING_CE;
return EXPANDING_CE;
return ucaData.getCEType(ch);
}
/**
@ -604,19 +598,11 @@ final public class UCA implements Comparator {
return result.toString();
}
/**
* Produces a human-readable string for a collation element
*/
static public String ceToString(int ce) {
return "[" + Utility.hex(getPrimary(ce)) + "."
+ Utility.hex(getSecondary(ce)) + "."
+ Utility.hex(getTertiary(ce)) + "]";
}
/**
* Produces a human-readable string for a collation element.
* value is terminated by -1!
*/
/*
static public String ceToString(int[] ces, int len) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < len; ++i) {
@ -624,11 +610,13 @@ final public class UCA implements Comparator {
}
return result.toString();
}
*/
/**
* Produces a human-readable string for a collation element.
* value is terminated by -1!
*/
/*
static public String ceToString(int[] ces) {
StringBuffer result = new StringBuffer();
for (int i = 0; ; ++i) {
@ -637,7 +625,7 @@ final public class UCA implements Comparator {
}
return result.toString();
}
*/
static boolean isImplicitLeadCE(int ce) {
return isImplicitLeadPrimary(getPrimary(ce));
@ -670,10 +658,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
* and to get the second part use (x & 0xFFFF)
*/
static void CodepointToImplicit(int cp, int[] output) {
void CodepointToImplicit(int cp, int[] output) {
int base = UNSUPPORTED_OTHER_BASE;
if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE;
else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
if (ucd.isCJK_BASE(cp)) base = UNSUPPORTED_CJK_BASE;
else if (ucd.isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
output[0] = base + (cp >>> 15);
output[1] = (cp & 0x7FFF) | 0x8000;
}
@ -768,6 +756,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// Privates
// =============================================================
IntStack expandingStack = new IntStack(10);
/**
* Array used to reorder surrogates to top of 16-bit range, and others down.
* Adds 2000 to D800..DFFF, making them F800..FFFF
@ -847,77 +838,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// Collation Element Memory Data Table Formats
// =============================================================
/**
* Used to composed Hangul and Han characters
*/
static final int NEUTRAL_SECONDARY = 0x20;
static final int NEUTRAL_TERTIARY = 0x02;
/**
* Temporary buffer used in getSortKey for the decomposed string
*/
private StringBuffer decompositionBuffer = new StringBuffer();
/**
* The collation element data is stored a couple of different structures.
* First is collationElements, which generally contains the 32-bit CE corresponding
* to the data. It is directly indexed by character code.<br>
* For brevity in the implementation, we just use a flat array.
* A real implementation would use a multi-stage table, as described in TUS Section 5.
* table of simple collation elements, indexed by char.<br>
* Exceptional cases: expanding, contracting, unsupported are handled as described below.
*/
private int[] collationElements = new int[65536];
/**
* A special bit combination in a CE is used to reserve exception cases. This has the effect
* of removing a small number of the primary key values out of the 65536 possible.
*/
private static final int EXCEPTION_CE_MASK = 0xF8000000;
/**
* Any unsupported characters (those not in the UCA data tables)
* are marked with a exception bit combination
* so that they can be treated specially.<br>
* There are at least 34 values, so that we can use a range for surrogates
* However, we do add to the first weight if we have surrogate pairs!
*/
private static final int UNSUPPORTED_CJK_BASE = 0xFF40;
private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
// was 0xFFC20101;
/**
* Contracting characters are marked with a exception bit combination
* in the collationElement table.
* This means that they are the first character of a contraction, and need
* to be looked up (with following characters) in the contractingTable.<br>
* This isn't a MASK since there is exactly one value.
*/
private static final int CONTRACTING = 0xFA310000;
/**
* Expanding characters are marked with a exception bit combination
* in the collationElement table.
* This means that they map to more than one CE, which is looked up in
* the expansionTable by index. See EXCEPTION_INDEX_MASK
*/
private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
/**
* This mask is used to get the index from an EXPANDING exception.
* The contracting characters can also make use of this in a future optimization.
*/
static final int EXCEPTION_INDEX_MASK = 0x0000FFFF;
/**
* We take advantage of the variables being in a closed range to save a bit per CE.
* The low and high values are initially set to be at the opposite ends of the range,
@ -931,27 +858,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
private int variableLowCE; // used for testing against
private int variableHighCE; // used for testing against
/**
* Although a single character can expand into multiple CEs, we don't want to burden
* the normal case with the storage. So, they get a special value in the collationElements
* array. This value has a distinct primary weight, followed by an index into a separate
* table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
* will be used for the expansion. The implementation is as a stack; this just makes it
* easy to generate.
*/
private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
/**
* For now, this is just a simple mapping of strings to collation elements.
* The implementation depends on the contracting characters being "completed",
* so that it can be efficiently determined when to stop looking.
*/
private Hashtable contractingTable = new Hashtable();
/*
/**
* Special char value that means failed or terminated
*/
private static final char NOT_A_CHAR = '\uFFFF';
private void fixSurrogateContraction(char ch) {
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
String chs = String.valueOf(ch);
Object probe = contractingTable.get(chs);
if (probe != null) return;
contractingTable.put(chs, new Integer(UNSUPPORTED));
}
*/
/**
* Marks whether we are using the full data set, or an abbreviated version for
@ -965,11 +883,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// Made part of the object to avoid reallocating each time.
// =============================================================
/**
* Stack for expanding characters
*/
private IntStack expandingStack = new IntStack(100);
/**
* Temporary buffers used in getSortKey to store weights
* these are NOT strings of Unicode characters--they are
@ -990,8 +903,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
* Temporary with requested decomposition
*/
boolean storedDecomposition;
int hangulHackBottom;
int hangulHackTop;
/**
* Used for supporting Thai rearrangement
@ -1015,7 +926,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
* (normalized) character code.
*/
private int getCE() {
if (!expandingStack.isEmpty()) return expandingStack.pop();
if (!expandingStack.isEmpty()) return expandingStack.popFront();
char ch;
// Fetch next character. Handle rearrangement for Thai, etc.
@ -1037,190 +948,56 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
}
}
int ce = collationElements[ch];
// Hangul tailoring hack
//if (!storedDecomposition && hangulHackBottom <= ce && ce < hangulHackTop) return fixJamo(ch, ce); // hard coded fix!!
// if the CE is not exceptional (unsupported, contracting, expanding) we are done.
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
if (ce == UNSUPPORTED) {
int bigChar = ch;
index = ucaData.get(ch, decompositionBuffer, index, expandingStack);
int ce = expandingStack.popFront(); // pop first (guaranteed to exist!)
if (ce == UNSUPPORTED_FLAG) {
return handleUnsupported(ch);
}
return ce;
}
private int handleUnsupported(char ch) {
int bigChar = ch;
// Special check for Hangul
if (isHangul(bigChar)) {
// MUST DECOMPOSE!!
hangulBuffer = new StringBuffer();
decomposeHangul(bigChar, hangulBuffer);
return getCE();
// RECURSIVE!!!
}
// Special check for Hangul
if (ucd.isHangulSyllable(bigChar)) {
// MUST DECOMPOSE!!
hangulBuffer = new StringBuffer();
decomposeHangul(bigChar, hangulBuffer);
return getCE();
// RECURSIVE!!!
}
// special check and fix for unsupported surrogate pair, 20 1/8 bits
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
// ignore unmatched surrogates (e.g. return zero)
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
int ch2 = decompositionBuffer.charAt(index);
if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched
index++; // skip next char
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
}
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
return 0;
}
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
return 0;
}
// special check and fix for unsupported surrogate pair, 20 1/8 bits
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
// ignore unmatched surrogates (e.g. return zero)
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
int ch2 = decompositionBuffer.charAt(index);
if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched
index++; // skip next char
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
}
// find the implicit values; returned in 0 and 1
int[] implicit = new int[2];
CodepointToImplicit(bigChar, implicit);
// find the implicit values; returned in 0 and 1
int[] implicit = new int[2];
CodepointToImplicit(bigChar, implicit);
// Now compose the two keys
// first push BBBB, which is #1
// Now compose the two keys
// push BBBB
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
// return AAAA
// now return AAAA, which is #0
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
}
if (ce == CONTRACTING) {
// Contracting is probably the most interesting (read "tricky") part
// of the algorithm.
// First get longest substring that is in the contracting table.
// For simplicity, we use a hash table for contracting.
// There are much better optimizations,
// but they take a more complicated build algorithm than we want to show here.
// NOTE: We are guaranteed that the character itself is in the contracting table because
// of the build process.
String probe = String.valueOf(ch);
Object value = contractingTable.get(probe);
if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
// We loop, trying to add successive characters to the longest substring.
while (index < decompositionBuffer.length()) {
char ch2 = decompositionBuffer.charAt(index);
// see whether the current string plus the next char are in
// the contracting table.
String newProbe = probe + ch2;
Object newValue = contractingTable.get(newProbe);
if (newValue == null) break; // stop if not in table.
// We succeeded--so update our new values, and set index
// and quaternary to indicate that we swallowed another character.
probe = newProbe;
value = newValue;
index++;
}
// Now, see if we can add any combining marks
short lastCan = 0;
for (int i = index; i < decompositionBuffer.length(); ++i) {
// We only take certain characters. They have to be accents,
// and they have to not be blocked.
// Unlike above, if we don't find a match (and it was an accent!)
// then we don't stop, we continue looping.
char ch2 = decompositionBuffer.charAt(i);
short can = toD.getCanonicalClass(ch2);
if (can == 0) break; // stop with any zero (non-accent)
if (can == lastCan) continue; // blocked if same class as last
lastCan = can; // remember for next time
// Now see if we can successfully add it onto our string
// and find it in the contracting table.
String newProbe = probe + ch2;
Object newValue = contractingTable.get(newProbe);
if (newValue == null) continue;
// We succeeded--so update our new values, remove the char, and update
// quaternary to indicate that we swallowed another character.
probe = newProbe;
value = newValue;
decompositionBuffer.setCharAt(i,'\u0000'); // zero char
}
// we are all done, and can extract the CE from the last value set.
ce = ((Integer)value).intValue();
// if the CE is not exceptional (unsupported expanding) we are done.
// BTW we will never have a contracting CE at this point.
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
// otherwise fall through to expansion
}
// expanding, so copy list of items onto stack
int index = ce & EXCEPTION_INDEX_MASK; // get index
// copy onto stack from index until reach TERMINATOR
while (true) {
ce = expandingTable.get(index++);
if (ce == TERMINATOR) break;
expandingStack.push(ce);
}
return expandingStack.pop(); // pop last (guaranteed to exist!)
}
// Han ideograph range tests.
// isCJK accepts the CJK_BASE..CJK_LIMIT block plus a fixed list of
// compatibility code points; isCJK_AB accepts the Extension A and
// Extension B ranges declared below.

/**
 * Tests whether a code point lies in [CJK_BASE, CJK_LIMIT) or is one of the
 * individually listed compatibility ideographs.
 * @param cp code point to test
 * @return true when cp is in the main range or in the compatibility list
 */
public static boolean isCJK(int cp) {
    if (cp >= CJK_BASE && cp < CJK_LIMIT) {
        return true;
    }
    switch (cp) {
    // compat characters that don't decompose.
    case 0xFA0E:
    case 0xFA0F:
    case 0xFA11:
    case 0xFA13:
    case 0xFA14:
    case 0xFA1F:
    case 0xFA21:
    case 0xFA23:
    case 0xFA24:
    case 0xFA27:
    case 0xFA28:
    case 0xFA29:
    case 0xFA2E:
    case 0xFA2F:
        return true;
    default:
        return false;
    }
}

/** Range boundaries; every LIMIT value is one past the last code point of its range. */
public static final int CJK_BASE = 0x4E00;
public static final int CJK_LIMIT = 0x9FFF + 1;
public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F + 1;
public static final int CJK_A_BASE = 0x3400;
public static final int CJK_A_LIMIT = 0x4DBF + 1;
public static final int CJK_B_BASE = 0x20000;
public static final int CJK_B_LIMIT = 0x2A6DF + 1;

/**
 * Tests whether a code point lies in [CJK_A_BASE, CJK_A_LIMIT) or in
 * [CJK_B_BASE, CJK_B_LIMIT).
 * @param bigChar code point to test (may be above 0xFFFF)
 * @return true when bigChar is in either range
 */
public static final boolean isCJK_AB(int bigChar) {
    boolean inExtensionA = bigChar >= CJK_A_BASE && bigChar < CJK_A_LIMIT;
    boolean inExtensionB = bigChar >= CJK_B_BASE && bigChar < CJK_B_LIMIT;
    return inExtensionA || inExtensionB;
}
/*
2E80..2EFF; CJK Radicals Supplement
2F00..2FDF; Kangxi Radicals
3400..4DBF; CJK Unified Ideographs Extension A
4E00..9FFF; CJK Unified Ideographs
F900..FAFF; CJK Compatibility Ideographs
20000..2A6DF; CJK Unified Ideographs Extension B
2F800..2FA1F; CJK Compatibility Ideographs Supplement
Compat:
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
*/
/**
 * Tests whether a code point lies in the inclusive range 0xAC00..0xD7A3.
 * @param bigChar code point to test
 * @return true when bigChar is inside that range
 */
private final boolean isHangul(int bigChar) {
    if (bigChar < 0xAC00) {
        return false;
    }
    return bigChar <= 0xD7A3;
}
/**
@ -1287,12 +1064,12 @@ Compat:
*/
private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0;
private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1;
Map multiTable = new TreeMap();
BitSet found = new BitSet();
UnicodeSet found = new UnicodeSet();
public Hashtable getContracting() {
/*public Hashtable getContracting() {
return new Hashtable(multiTable);
}
*/
public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
return new UCAContents(ceLimit, skipDecomps, ucdVersion);
@ -1317,6 +1094,16 @@ Compat:
this.ceLimit = ceLimit;
this.nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
this.skipDecomps = skipDecomps;
// FIX SAMPLES
if (SAMPLE_RANGES[0][0] == 0) {
for (int i = 0; ; ++i) { // add first unallocated character
if (!ucd.isAssigned(i)) {
SAMPLE_RANGES[0][0] = i;
break;
}
}
}
}
/**
@ -1334,7 +1121,9 @@ Compat:
// normal case
while (current++ < 0x10FFFF) {
if (current == 0x406) {
System.out.println("DEBUG");
}
//char ch = (char)current;
byte type = getCEType(current);
if (type >= ceLimit || type == CONTRACTING_CE) continue;
@ -1349,15 +1138,18 @@ Compat:
}
// contractions
if (enum == null) enum = multiTable.keySet().iterator();
if (enum.hasNext()) {
if (enum == null) enum = ucaData.getContractions();
while (enum.hasNext()) {
result = (String)enum.next();
if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
//System.out.println("Skipping " + ucd.getCodeAndName(result));
continue; // try again
}
return result;
}
// extra samples
if (currentRange < SAMPLE_RANGES.length) {
System.out.println("*");
try {
result = UTF16.valueOf(itemInRange);
} catch (RuntimeException e) {
@ -1372,10 +1164,11 @@ Compat:
endOfRange = SAMPLE_RANGES[currentRange].length > 1
? SAMPLE_RANGES[currentRange][1]
: startOfRange;
skip = ((endOfRange - startOfRange) / 513);
//skip = ((endOfRange - startOfRange) / 3);
}
} else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) {
itemInRange += skip;
} else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) {
//itemInRange += skip;
itemInRange = endOfRange - 5;
}
}
@ -1410,14 +1203,16 @@ Compat:
}
static final int[][] SAMPLE_RANGES = {
{0x10000},
{0x10FFFF},
{0x0220},
{0}, // LEAVE EMPTY--Turns into first unassigned character
{0xFFF0},
{0xD800},
{0xDFFF},
{0xFFFE},
{0xFFFF},
{0x10000},
{0xC0000},
{0xD0000},
{0x10FFFF},
{0x10FFFE},
{0x10FFFF},
{0x3400, 0x4DB5},
@ -1426,7 +1221,7 @@ Compat:
{0xA000, 0xA48C},
{0xE000, 0xF8FF},
{0x20000, 0x2A6D6},
{0xE0000, 0xE00FF},
{0xE0000, 0xE007E},
{0xF0000, 0xF00FD},
{0xFFF00, 0xFFFFD},
{0x100000, 0x1000FD},
@ -1438,7 +1233,7 @@ Compat:
* Values will override any previous mappings.
*/
private void addCollationElements(BufferedReader in) throws java.io.IOException {
IntStack tempStack = new IntStack(100); // used for reversal
IntStack tempStack = new IntStack(100);
StringBuffer multiChars = new StringBuffer(); // used for contracting chars
String inputLine = "";
boolean[] wasImplicitLeadPrimary = new boolean[1];
@ -1448,6 +1243,10 @@ Compat:
if (inputLine == null) break; // means file is done
String line = cleanLine(inputLine); // remove comments, extra whitespace
if (line.length() == 0) continue; // skip empty lines
if (DEBUG_SHOW_LINE) {
System.out.println("Processing: " + inputLine);
}
position[0] = 0; // start at front of line
if (line.startsWith("@version")) {
@ -1464,29 +1263,21 @@ Compat:
}
// collect characters
char value = getChar(line, position);
fixSurrogateContraction(value);
char value2 = getChar(line, position);
multiChars.setLength(0); // clear buffer
if (value2 != NOT_A_CHAR) {
fixSurrogateContraction(value2);
multiChars.append(value); // append until we get terminator
char value = getChar(line, position);
multiChars.append(value);
//fixSurrogateContraction(value);
char value2 = getChar(line, position);
// append until we get terminator
while (value2 != NOT_A_CHAR) {
multiChars.append(value2);
while (true) {
value2 = getChar(line, position);
if (value2 == NOT_A_CHAR) break;
fixSurrogateContraction(value2);
multiChars.append(value2);
}
value2 = getChar(line, position);
}
if (RECORDING_CHARS) {
if (multiChars.length() > 1) {
multiTable.put(multiChars.toString(), "");
}
found.set(value);
for (int i = 1; i < multiChars.length(); ++i) {
found.set(multiChars.charAt(i));
}
found.addAll(multiChars.toString());
}
if (!fullData && RECORDING_DATA) {
if (value == 0 || value == '\t' || value == '\n' || value == '\r'
@ -1522,141 +1313,69 @@ Compat:
}
}
}
if (ce2 != TERMINATOR) { // have expanding character!
// put list into the expanding table
// use a temporary stack to get them in reverse order
tempStack.push(ce);
tempStack.push(ce2);
// set collationElement to exception value, plus index
ce = EXPANDING_MASK | expandingTable.getTop();
while (true) {
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
if (ce2 == TERMINATOR) break;
tempStack.push(ce2);
}
// push onto expanding table, now in reverse order
while (!tempStack.isEmpty()) expandingTable.push(tempStack.pop());
expandingTable.push(TERMINATOR);
}
//if (value == 0xd801) System.out.print("DEBUG: " + line);
// assign CE(s) to char(s)
if (multiChars.length() > 0) {
contractingTable.put(multiChars.toString(), new Integer(ce));
if (collationElements[value] == UNSUPPORTED) {
collationElements[value] = CONTRACTING; // mark special
} else if (collationElements[value] != CONTRACTING) {
// move old value to contracting table!
contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
collationElements[value] = CONTRACTING; // signal we must look up in table
}
} else if (collationElements[value] == CONTRACTING) {
// must add old value to contracting table!
contractingTable.put(String.valueOf(value), new Integer(ce));
} else {
collationElements[value] = ce; // normal
}
//} catch (Exception e) {
// throw new IllegalArgumentException("Malformed line: " + inputLine + "\n "
// + e.getClass().getName() + ": " + e.getMessage());
tempStack.clear();
tempStack.push(ce);
while (ce2 != TERMINATOR) {
tempStack.push(ce2);
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
if (ce2 == TERMINATOR) break;
}
ucaData.add(multiChars, tempStack);
} catch (RuntimeException e) {
System.out.println("Error on line: " + inputLine);
throw e;
}
}
private void fixSurrogateContraction(char ch) {
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
String chs = String.valueOf(ch);
Object probe = contractingTable.get(chs);
if (probe != null) return;
contractingTable.put(chs, new Integer(0));
}
/*
private void concat(int[] ces1, int[] ces2) {
}
private void add(String source, int[] ces, int ceLen) {
int ce;
if (ceLen < 1) {
throw new IllegalArgumentException("CE too short: " + ceLen);
} else if (ceLen == 1) {
ce = ces[0];
} else {
ce = EXPANDING_MASK | expandingTable.getTop();
for (int i = 0; i < ceLen; ++i) {
expandingTable.push(ces[i]);
}
}
// assign CE(s) to char(s)
int value = source.charAt(0);
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
if (source.length() > 0) {
contractingTable.put(source.toString(), new Integer(ce));
if (collationElements[value] == UNSUPPORTED) {
collationElements[value] = CONTRACTING; // mark special
} else if (collationElements[value] != CONTRACTING) {
// move old value to contracting table!
contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
collationElements[value] = CONTRACTING; // signal we must look up in table
}
} else if (collationElements[value] == CONTRACTING) {
// must add old value to contracting table!
contractingTable.put(source, new Integer(ce));
} else {
collationElements[source.charAt(0)] = ce; // normal
}
}
*/
/**
* Checks the internal tables corresponding to the UCA data.
*/
private void cleanup() {
// at this point, we have to guarantee that the contractingTable is CLOSED
// e.g. if a substring of length n is in the table, then the first n-1 characters
// are also!!
ucaData.checkConsistency();
Map missingStrings = new HashMap();
Map tempMap = new HashMap();
/*
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
int[] temp1 = int[20];
int[] temp2 = int[20];
int[] temp3 = int[20];
getCEs("\u0fb2", true, temp1);
getCEs("\u0fb3", true, temp2);
getCEs("\u0f71", true, temp3);
add("\u0FB2\u0F71", concat(temp1, temp3));
*/
Hashtable missingStrings = new Hashtable();
int[] temp1 = new int[20];
Enumeration enum = contractingTable.keys();
while (enum.hasMoreElements()) {
String sequence = (String)enum.nextElement();
Iterator enum = ucaData.getContractions();
while (enum.hasNext()) {
String sequence = (String)enum.next();
//System.out.println("Contraction: " + Utility.hex(sequence));
for (int i = sequence.length()-1; i > 0; --i) {
String shorter = sequence.substring(0,i);
Object probe = contractingTable.get(shorter);
if (probe == null) {
int len = getCEs(shorter, true, temp1);
if (false) System.out.println("WARNING: CLOSING: " + UCD.make().getCodeAndName(shorter) + " => " + ceToString(temp1, len));
add(shorter, temp1, len);
if (!ucaData.contractionTableContains(shorter)) {
IntStack tempStack = new IntStack(1);
getCEs(shorter, true, tempStack);
if (false) System.out.println("WARNING: CLOSING: " + ucd.getCodeAndName(shorter)
+ " => " + CEList.toString(tempStack));
tempMap.put(shorter, tempStack);
// missingStrings.put(shorter,"");
// collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values
}
}
}
enum = missingStrings.keys();
// now add them. We couldn't before because we were iterating over it.
enum = tempMap.keySet().iterator();
while (enum.hasNext()) {
String shorter = (String) enum.next();
IntStack tempStack = (IntStack) tempMap.get(shorter);
ucaData.add(shorter, tempStack);
}
enum = missingStrings.keySet().iterator();
if (missingStrings.size() != 0) {
/**
while (enum.hasMoreElements()) {
@ -1666,26 +1385,30 @@ Compat:
}
*/
String errorMessage = "";
while (enum.hasMoreElements()) {
String missing = (String)enum.nextElement();
while (enum.hasNext()) {
String missing = (String)enum.next();
if (errorMessage.length() != 0) errorMessage += ", ";
errorMessage += "\"" + missing + "\"";
}
throw new IllegalArgumentException("Contracting table not closed! Missing " + errorMessage);
}
//fixlater;
variableLowCE = variableLow << 16;
variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits
hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
//int hangulHackBottom;
//int hangulHackTop;
//hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
//hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
//if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
// show some statistics
if (SHOW_STATS) System.out.println("\tcount1: " + count1);
if (SHOW_STATS) System.out.println("\tcount2: " + max2);
if (SHOW_STATS) System.out.println("\tcount3: " + max3);
if (SHOW_STATS) System.out.println("\tcontractions: " + ucaData.getContractionCount());
if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1));
if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2));
@ -1912,7 +1635,7 @@ Compat:
/**
* Used for checking data file integrity
*/
private Hashtable uniqueTable = new Hashtable();
private Map uniqueTable = new HashMap();
/**
* Used for checking data file integrity

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.18 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -31,9 +31,12 @@ import com.ibm.text.UCD.UCD_Types;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.Normalizer;
public class WriteCollationData implements UCD_Types {
public class WriteCollationData implements UCD_Types, UCA_Types {
static final boolean DEBUG = false;
static final boolean DEBUG_SHOW_ITERATION = true;
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
@ -289,7 +292,21 @@ public class WriteCollationData implements UCD_Types {
static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException {
UCD ucd30 = UCD.make("3.0.0");
//UCD ucd30 = UCD.make("3.0.0");
/*
U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
=> U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON
*/
String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"};
for (int jj = 0; jj < testList.length; ++jj) {
String t = testList[jj];
System.out.println(ucd.getCodeAndName(t));
String test = collator.getSortKey(t, UCA.NON_IGNORABLE);
System.out.println("Decomp: " + collator.toString(test));
test = collator.getSortKey(t, UCA.NON_IGNORABLE, false);
System.out.println("No Dec: " + collator.toString(test));
}
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
if (!shortPrint) log.write('\uFEFF');
@ -297,9 +314,39 @@ public class WriteCollationData implements UCD_Types {
System.out.println("Sorting");
int counter = 0;
for (int i = 0; i <= 0x10FFFF; ++i) {
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
cc.enableSamples();
UnicodeSet found2 = new UnicodeSet();
while (true) {
String s = cc.next();
if (s == null) break;
found2.addAll(s);
if (DEBUG_SHOW_ITERATION) {
int cp = UTF16.charAt(s, 0);
if (cp == 0x220 || !ucd.isAssigned(cp) || ucd.isCJK_BASE(cp)) {
System.out.println(ucd.getCodeAndName(s));
}
}
Utility.dot(counter++);
if (!ucd.isRepresented(i)) continue;
addStringX(s, option);
// TODO: add other accents with Cyrillic
}
UnicodeSet found = collator.found;
if (!found2.containsAll(found2)) {
System.out.println("In both: " + new UnicodeSet(found).retainAll(found2).toPattern(true));
System.out.println("In UCA but not iteration: " + new UnicodeSet(found).removeAll(found2).toPattern(true));
System.out.println("In iteration but not UCA: " + new UnicodeSet(found2).removeAll(found).toPattern(true));
throw new IllegalArgumentException("Inconsistent data");
}
/*
for (int i = 0; i <= 0x10FFFF; ++i) {
if (!ucd.isAssigned(i)) continue;
addStringX(UTF32.valueOf32(i), option);
}
@ -318,15 +365,6 @@ public class WriteCollationData implements UCD_Types {
addStringX(s, option);
}
for (int i = 0; ; ++i) { // add first unallocated character
if (!ucd.isAssigned(i)) {
String s = UTF32.valueOf32(i);
Utility.fixDot();
System.out.println("Adding: " + Utility.hex(s));
addStringX(s, option);
break;
}
}
for (int i = 0; i < extraConformanceRanges.length; ++i) {
@ -343,6 +381,7 @@ public class WriteCollationData implements UCD_Types {
addStringX(end-1, option);
addStringX(end, option);
}
*/
Utility.fixDot();
System.out.println("Total: " + sortedD.size());
@ -364,12 +403,12 @@ public class WriteCollationData implements UCD_Types {
//String status = key.equals(lastKey) ? "*" : "";
//lastKey = key;
//log.println(source);
char extra = source.charAt(source.length()-1);
String clipped = source.substring(0, source.length()-1);
String stren = source.substring(source.length()-1);
if (!shortPrint) {
log.print(Utility.hex(source));
log.print(
";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key));
";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
} else {
log.print(source + "\t" + Utility.hex(clipped));
}
@ -384,13 +423,15 @@ public class WriteCollationData implements UCD_Types {
/**
* Convenience overload: converts the int x (presumably a code point) to a
* String with UTF32.valueOf32 and forwards it, with the same option, to
* addStringX(String, byte).
*/
static void addStringX(int x, byte option) {
addStringX(UTF32.valueOf32(x), option);
}
static final char LOW_ACCENT = '\u0325';
static void addStringX(String s, byte option) {
addStringY(s + 'a', option);
addStringY(s + 'A', option);
addStringY(s + 'á', option);
addStringY(s + 'b', option);
addStringY(s + '\u0325', option);
addStringY(s + LOW_ACCENT, option);
addStringY(s + '!', option);
}
@ -527,7 +568,7 @@ public class WriteCollationData implements UCD_Types {
if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
String comp = collator.ceToString(kenComp, kenCLen);
String comp = CEList.toString(kenComp, kenCLen);
if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
forLater.put((char)(COMPRESSED | type) + s, comp);
@ -567,10 +608,10 @@ public class WriteCollationData implements UCD_Types {
String comp = (String)forLater.get(key);
int kenLen = collator.getCEs(s, decompType, kenCes);
String kenStr = collator.ceToString(kenCes, kenLen);
String kenStr = CEList.toString(kenCes, kenLen);
int markLen = fixCompatibilityCE(s, true, markCes, false);
String markStr = collator.ceToString(markCes, markLen);
String markStr = CEList.toString(markCes, markLen);
if ((type & COMPRESSED) != 0) {
log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
@ -589,7 +630,7 @@ public class WriteCollationData implements UCD_Types {
log.println("NFD : " + ucd.getCodeAndName(nfd));
}
//kenCLen = collator.getCEs(decomp, true, kenComp);
//log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));
//log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));
}
log.println();
}
@ -785,7 +826,7 @@ public class WriteCollationData implements UCD_Types {
if (s.length() > 1) {
diLog.println(Utility.hex(s, " ")
+ ";\t #" + collator.ceToString(ces, len)
+ ";\t #" + CEList.toString(ces, len)
+ " ( " + s + " )"
+ " " + ucd.getName(s));
}
@ -859,7 +900,7 @@ public class WriteCollationData implements UCD_Types {
ccc = UTF32.char32At(s,kk);
byte cat = ucd.getCategory(ccc);
if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
break;
}
}
@ -882,7 +923,7 @@ public class WriteCollationData implements UCD_Types {
if (collator.isVariable(ce)) haveMixture |= 1;
else haveMixture |= 2;
if (haveMixture == 3) {
mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
}
}
}
@ -1030,7 +1071,7 @@ public class WriteCollationData implements UCD_Types {
ccc = UTF32.char32At(s,kk);
byte cat = ucd.getCategory(ccc);
if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
break;
}
}
@ -1053,7 +1094,7 @@ public class WriteCollationData implements UCD_Types {
if (collator.isVariable(ce)) haveMixture |= 1;
else haveMixture |= 2;
if (haveMixture == 3) {
mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
}
}
}
@ -1130,8 +1171,8 @@ public class WriteCollationData implements UCD_Types {
+ "\t" + head
//+ "\t" + Utility.hex(oldWeight)
//+ " => " + Utility.hex(newWeight)
+ "\t" + collator.ceToString(ces, len)
+ (doNew ? " => " + collator.ceToString(newCes, newLen) : "")
+ "\t" + CEList.toString(ces, len)
+ (doNew ? " => " + CEList.toString(newCes, newLen) : "")
+ "\t( " + src + " )"
+ "\t" + ucd.getName(src)
);
@ -1198,7 +1239,7 @@ public class WriteCollationData implements UCD_Types {
if (false) {
int len2 = collator.getCEs("\u2474", true, ces);
System.out.println(UCA.ceToString(ces, len2));
System.out.println(CEList.toString(ces, len2));
String a = collator.getSortKey("a");
String b = collator.getSortKey("A");
@ -1442,9 +1483,9 @@ F900..FAFF; CJK Compatibility Ideographs
if (false) System.out.println(
collator.ceToString(lastCE) + " "
+ collator.ceToString(ce) + " "
+ collator.ceToString(nextCE) + " "
CEList.toString(lastCE) + " "
+ CEList.toString(ce) + " "
+ CEList.toString(nextCE) + " "
+ ucd.getCodeAndName(chr)
);
@ -1513,7 +1554,7 @@ F900..FAFF; CJK Compatibility Ideographs
*/
if (chr.equals("\u2F00")) {
System.out.println(UCA.ceToString(ces, len));
System.out.println(CEList.toString(ces, len));
}
// There are double-CEs, so we have to know what the length of the first bit is.
@ -1561,7 +1602,7 @@ F900..FAFF; CJK Compatibility Ideographs
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
if (option == WITH_NAMES) {
log.print("\t# "
+ collator.ceToString(ces, len) + " "
+ CEList.toString(ces, len) + " "
+ ucd.getCodeAndName(chr));
if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion));
}
@ -1801,7 +1842,7 @@ F900..FAFF; CJK Compatibility Ideographs
// we failed completely. Print error message, and bail
System.out.println("No back map for " + collator.ceToString(ces[i])
System.out.println("No back map for " + CEList.toString(ces[i])
+ " from " + CEList.toString(ces, len));
System.out.println("\t" + ucd.getCodeAndName(chr)
+ " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
@ -2126,6 +2167,7 @@ F900..FAFF; CJK Compatibility Ideographs
continue;
}
canIt.setSource(key);
boolean first = true;
while (true) {
String s = canIt.next();
@ -2134,9 +2176,6 @@ F900..FAFF; CJK Compatibility Ideographs
if (contentsForCanonicalIteration.contains(s)) continue;
if (additionalSet.contains(s)) continue;
if (s.equals("\u01EC")) {
System.out.println("01ec");
}
// Skip anything that is not FCD.
if (!NFD.isFCD(s)) continue;
@ -2234,7 +2273,7 @@ F900..FAFF; CJK Compatibility Ideographs
log.println("# - Differs from previous version in that MAX value was introduced at 1F.");
log.println("# All tertiary values are shifted down by 1, filling the gap at 7!");
int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24;
int firstImplicit = getImplicitPrimary(CJK_BASE) >>> 24;
int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
@ -2285,13 +2324,15 @@ F900..FAFF; CJK Compatibility Ideographs
int sec = UCA.getSecondary(ces[q]);
int ter = UCA.getTertiary(ces[q]);
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
// special treatment for unsupported!
if (UCA.isImplicitLeadPrimary(pri)) {
System.out.println("DEBUG: " + CEList.toString(ces, len)
+ ", Current: " + q + ", " + ucd.getCodeAndName(chr));
++q;
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
int pri2 = UCA.getPrimary(ces[q]);
// get old code point
@ -2301,7 +2342,7 @@ F900..FAFF; CJK Compatibility Ideographs
// double check results!
int[] testImplicit = new int[2];
UCA.CodepointToImplicit(cp, testImplicit);
collator.CodepointToImplicit(cp, testImplicit);
boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1];
if (gotError) {
System.out.println("ERROR");
@ -2360,7 +2401,7 @@ F900..FAFF; CJK Compatibility Ideographs
}
if (nonePrinted) {
log.print("[,,]");
oldStr.append(UCA.ceToString(0));
oldStr.append(CEList.toString(0));
}
longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0)));
log.println();
@ -2386,7 +2427,7 @@ F900..FAFF; CJK Compatibility Ideographs
boolean lastOne = false;
for (int i = 0; i < 0x10FFFF; ++i) {
boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i);
boolean thisOne = ucd.isCJK_BASE(i) || ucd.isCJK_AB(i);
if (thisOne != lastOne) {
summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
@ -2425,7 +2466,7 @@ F900..FAFF; CJK Compatibility Ideographs
summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
+ Utility.hex(sampleEq[i]) + " ");
for (int q = 0; q < len; ++q) {
summary.print(UCA.ceToString(ces[q]));
summary.print(CEList.toString(ces[q]));
}
summary.println(" " + ucd.getName(sampleEq[i]));
}
@ -2499,24 +2540,24 @@ F900..FAFF; CJK Compatibility Ideographs
*/
static int swapCJK(int i) {
if (i >= UCA.CJK_BASE) {
if (i < UCA.CJK_LIMIT) return i - UCA.CJK_BASE;
if (i >= CJK_BASE) {
if (i < CJK_LIMIT) return i - CJK_BASE;
if (i < UCA.CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_COMPAT_USED_LIMIT) return i - UCA.CJK_COMPAT_USED_BASE
+ (UCA.CJK_LIMIT - UCA.CJK_BASE);
if (i < UCA.CJK_B_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
+ (CJK_LIMIT - CJK_BASE);
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_B_LIMIT) return i; // non-BMP-CJK
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
return i + NON_CJK_OFFSET; // non-CJK
}
if (i < UCA.CJK_A_BASE) return i + NON_CJK_OFFSET;
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_A_LIMIT) return i - UCA.CJK_A_BASE
+ (UCA.CJK_LIMIT - UCA.CJK_BASE)
+ (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE);
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
+ (CJK_LIMIT - CJK_BASE)
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
return i + NON_CJK_OFFSET; // non-CJK
}
@ -2642,14 +2683,14 @@ static int swapCJK(int i) {
oldPrimary = newPrimary;
}
showImplicit("# First CJK", UCA.CJK_BASE);
showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE);
showImplicit("# Last CJK-compat", UCA.CJK_COMPAT_USED_LIMIT-1);
showImplicit("# First CJK_A", UCA.CJK_A_BASE);
showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1);
showImplicit("# First CJK_B", UCA.CJK_B_BASE);
showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
showImplicit("# First CJK", CJK_BASE);
showImplicit("# Last CJK", CJK_LIMIT-1);
showImplicit("# First CJK-compat", CJK_COMPAT_USED_BASE);
showImplicit("# Last CJK-compat", CJK_COMPAT_USED_LIMIT-1);
showImplicit("# First CJK_A", CJK_A_BASE);
showImplicit("# Last CJK_A", CJK_A_LIMIT-1);
showImplicit("# First CJK_B", CJK_B_BASE);
showImplicit("# Last CJK_B", CJK_B_LIMIT-1);
showImplicit("# First Other Implicit", 0);
showImplicit("# Last Other Implicit", 0x10FFFF);
@ -2667,9 +2708,9 @@ static int swapCJK(int i) {
// separate the three groups
if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) {
if (ucd.isCJK_BASE(i) || CJK_COMPAT_USED_BASE <= i && i < CJK_COMPAT_USED_LIMIT) {
if (batch != 0) continue;
} else if (UCA.isCJK_AB(i)) {
} else if (ucd.isCJK_AB(i)) {
if (batch != 1) continue;
} else if (batch != 2) continue;
@ -2993,7 +3034,7 @@ static int swapCJK(int i) {
for (char ch = 0; ch < 0xFFFF; ++ch) {
byte type = collator.getCEType(ch);
if (type < UCA.FIXED_CE) {
if (type < FIXED_CE) {
int len = collator.getCEs(String.valueOf(ch), true, ces);
int primary = UCA.getPrimary(ces[0]);
if (primary < variableHigh) continue;
@ -3088,36 +3129,22 @@ static int swapCJK(int i) {
System.out.println("Sorting");
for (int i = 0; i <= 0xFFFF; ++i) {
if (EXCLUDE_UNSUPPORTED && !collator.found.get(i)) continue;
if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue;
if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
//if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
addString(UTF32.valueOf32(i), option);
}
Hashtable multiTable = collator.getContracting();
Enumeration enum = multiTable.keys();
while (enum.hasMoreElements()) {
addString((String)enum.nextElement(), option);
}
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
cc.enableSamples();
for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters
addString(extraConformanceTests[i], option);
while (true) {
String s = cc.next();
if (s == null) break;
addString(s, option);
}
for (int i = 0; i < extraConformanceRanges.length; ++i) {
int start = extraConformanceRanges[i][0];
int end = extraConformanceRanges[i][1];
int increment = ((end - start + 1) / 303) + 1;
//System.out.println("Range: " + start + ", " + end + ", " + increment);
addString(start, option);
for (int j = start+1; j < end-1; j += increment) {
addString(j, option);
addString(j+1, option);
}
addString(end-1, option);
addString(end, option);
}
System.out.println("Total: " + sortedD.size());
Iterator it;

View file

@ -5,12 +5,14 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.7 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
WARNING: OLD FILE. DON'T COMPILE.
package com.ibm.text.UCA;
import java.util.*;
@ -21,6 +23,7 @@ import com.ibm.text.UCD.*;
import com.ibm.text.utility.*;
public class WriteHTMLCollation implements UCD_Types {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
@ -74,8 +77,8 @@ public class WriteHTMLCollation implements UCD_Types {
*/
// DO FOLLOWING
writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
//writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
//writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
// SKIP BELOW
if (true) return;
@ -178,7 +181,7 @@ public class WriteHTMLCollation implements UCD_Types {
}
return result.toString();
}
/*
static void writeConformance(String filename, byte option) throws IOException {
PrintWriter log = Utility.openPrintWriter(filename);
@ -193,6 +196,7 @@ public class WriteHTMLCollation implements UCD_Types {
addStringX(c, option);
}
Hashtable multiTable = collator.getContracting();
Enumeration enum = multiTable.keys();
while (enum.hasMoreElements()) {
@ -248,7 +252,8 @@ public class WriteHTMLCollation implements UCD_Types {
sortedD.clear();
System.out.println("Done");
}
*/
/**
* Convenience overload: turns x into a one-char String and forwards it, with
* the same option, to addStringX(String, byte).
* NOTE(review): the (char) cast keeps only the low 16 bits of x, so values
* above 0xFFFF are not represented faithfully -- confirm callers only pass
* BMP values.
*/
static void addStringX(int x, byte option) {
addStringX(String.valueOf((char)x), option);
}
@ -382,7 +387,7 @@ public class WriteHTMLCollation implements UCD_Types {
if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
String comp = collator.ceToString(kenComp, kenCLen);
String comp = CEList.toString(kenComp, kenCLen);
if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
forLater.put((char)(COMPRESSED | type) + s, comp);
@ -422,10 +427,10 @@ public class WriteHTMLCollation implements UCD_Types {
String comp = (String)forLater.get(key);
int kenLen = collator.getCEs(s, decompType, kenCes);
String kenStr = collator.ceToString(kenCes, kenLen);
String kenStr = CEList.toString(kenCes, kenLen);
int markLen = fixCompatibilityCE(s, true, markCes, false);
String markStr = collator.ceToString(markCes, markLen);
String markStr = CEList.toString(markCes, markLen);
if ((type & COMPRESSED) != 0) {
log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
@ -444,7 +449,7 @@ public class WriteHTMLCollation implements UCD_Types {
log.println("NFD : " + ucd.getCodeAndName(nfdstr));
}
//kenCLen = collator.getCEs(decomp, true, kenComp);
//log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));
//log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));
}
log.println();
}
@ -569,7 +574,7 @@ public class WriteHTMLCollation implements UCD_Types {
{
int len2 = collator.getCEs("\u2474", true, ces);
System.out.println(UCA.ceToString(ces, len2));
System.out.println(CEList.toString(ces, len2));
String a = collator.getSortKey("a");
String b = collator.getSortKey("A");
@ -640,7 +645,7 @@ public class WriteHTMLCollation implements UCD_Types {
else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) relation = " <<<";
lastCE = ce;
if (chr.equals("\u2474")) {
System.out.println(UCA.ceToString(ces, len));
System.out.println(CEList.toString(ces, len));
}
// check expansions
@ -653,7 +658,7 @@ public class WriteHTMLCollation implements UCD_Types {
int probe = ces[i];
String s = getFromBackMap(backMap, probe);
if (s == null) {
System.out.println("No back map for " + collator.ceToString(ces[i])
System.out.println("No back map for " + CEList.toString(ces[i])
+ ": " + ucd.getCodeAndName(chr));
expansion += "[" + Utility.hex(ces[i]) + "]";
} else {
@ -943,7 +948,7 @@ public class WriteHTMLCollation implements UCD_Types {
}
if (sampleEq[sec] == null) sampleEq[sec] = chr;
if (sampleEq[ter] == null) sampleEq[ter] = chr;
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
int np = primaryDelta[UCA.getPrimary(ces[q])];
hexBytes(np, newPrimary);
hexBytes(fixSecondary(UCA.getSecondary(ces[q])), newSecondary);
@ -968,7 +973,7 @@ public class WriteHTMLCollation implements UCD_Types {
}
if (nonePrinted) {
log.print("[,,]");
oldStr.append(UCA.ceToString(0));
oldStr.append(CEList.toString(0));
}
log.println(" # " + oldStr + " # " + ucd.getName(chr.charAt(0)));
lastChr = chr;
@ -1017,7 +1022,7 @@ public class WriteHTMLCollation implements UCD_Types {
summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
+ Utility.hex(sampleEq[i]) + " ");
for (int q = 0; q < len; ++q) {
summary.print(UCA.ceToString(ces[q]));
summary.print(CEList.toString(ces[q]));
}
summary.println(" " + ucd.getName(sampleEq[i]));
}
@ -1438,7 +1443,7 @@ public class WriteHTMLCollation implements UCD_Types {
for (int i = 0; i <= 0xFFFF; ++i) {
char c = (char)i;
if (EXCLUDE_UNSUPPORTED && !collator.found.get(c)) continue;
if (EXCLUDE_UNSUPPORTED && !collator.found.contains(c)) continue;
if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
//if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
addString(String.valueOf(c), option);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.15 $
* $Date: 2002/06/15 02:47:14 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -63,6 +63,7 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime();
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.13 $
* $Date: 2002/06/15 02:47:13 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -146,7 +146,7 @@ public final class UCD implements UCD_Types {
* Get the character names for the code points in a string, separated by ", "
*/
public String getName(String s, byte style) {
if (s.length() == 1) return get(s.charAt(0), true).name;
if (s.length() == 1) return getName(s.charAt(0), style);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
@ -182,15 +182,15 @@ public final class UCD implements UCD_Types {
/**
* Get the name and number (U+xxxx NAME) for a code point
*/
public String getCodeAndName(int codePoint) {
return getCode(codePoint) + " " + getName(codePoint);
public String getCodeAndName(int codePoint, byte type) {
return getCode(codePoint) + " " + getName(codePoint, type);
}
/**
* Get the name and number (U+xxxx NAME) for the code points in a string,
* separated by ", "
*/
public String getCodeAndName(String s) {
public String getCodeAndName(String s, byte type) {
if (s == null || s.length() == 0) return "NULL";
if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path
StringBuffer result = new StringBuffer();
@ -203,6 +203,20 @@ public final class UCD implements UCD_Types {
return result.toString();
}
/**
* Get the name and number (U+xxxx NAME) for a code point
*/
public String getCodeAndName(int codePoint) {
return getCodeAndName(codePoint, NORMAL);
}
/**
* Get the name and number (U+xxxx NAME) for the code points in a string,
* separated by ", ", using the NORMAL name style.
* Delegates to getCodeAndName(String, byte), which returns "NULL" for a
* null or empty string.
*/
public String getCodeAndName(String s) {
return getCodeAndName(s, NORMAL);
}
/**
* Get the general category
*/
@ -990,10 +1004,20 @@ to guarantee identifier closure.
result = getRaw(codePoint);
if (result == null) {
result = UData.UNASSIGNED;
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
result.name = null; // clean this up, since we reuse UNASSIGNED
result.shortName = null;
if (fixStrings) {
result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
}
}
if (result.shortName != null && result.shortName.length() == 0) {
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
if (fixStrings) {
if (result.name == null) {
result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
System.out.println("Warning: fixing name for " + result.name);
}
if (result.shortName == null) {
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
}
}
return result;
case 0x3400: // CJK Ideograph Extension A
@ -1024,6 +1048,8 @@ to guarantee identifier closure.
result = getRaw(rangeStart);
if (result == null) {
result = UData.UNASSIGNED;
result.name = null; // clean this up, since we reuse UNASSIGNED
result.shortName = null;
if (fixStrings) {
result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
@ -1047,6 +1073,32 @@ to guarantee identifier closure.
return result;
}
// Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
/**
 * Tests whether a code point falls in the CJK_A range or the CJK_B range.
 * Both ranges are half-open: the BASE is included, the LIMIT is not.
 *
 * @param bigChar the code point to test
 * @return true if CJK_A_BASE <= bigChar < CJK_A_LIMIT or
 *         CJK_B_BASE <= bigChar < CJK_B_LIMIT
 */
public static final boolean isCJK_AB(int bigChar) {
    // First range: CJK_A.
    if (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT) {
        return true;
    }
    // Otherwise the answer is decided by the CJK_B range alone.
    return CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT;
}
public static boolean isCJK_BASE(int cp) {
return (CJK_BASE <= cp && cp < CJK_LIMIT
|| cp == 0xFA0E // compat characters that don't decompose.
|| cp == 0xFA0F
|| cp == 0xFA11
|| cp == 0xFA13
|| cp == 0xFA14
|| cp == 0xFA1F
|| cp == 0xFA21
|| cp == 0xFA23
|| cp == 0xFA24
|| cp == 0xFA27
|| cp == 0xFA28
|| cp == 0xFA29
|| cp == 0xFA2E
|| cp == 0xFA2F
);
}
// Hangul constants
public static final int
@ -1108,7 +1160,7 @@ to guarantee identifier closure.
return 0xFFFF; // no composition
}
static boolean isHangulSyllable(int char1) {
static public boolean isHangulSyllable(int char1) {
return SBase <= char1 && char1 < SLimit;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.12 $
* $Date: 2002/06/15 02:47:13 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -21,8 +21,17 @@ public interface UCD_Types {
public static final String UCD_DIR = BASE_DIR + "UCD\\";
public static final String BIN_DIR = BASE_DIR + "BIN\\";
public static final String GEN_DIR = BASE_DIR + "GEN\\";
// Code point ranges for the CJK blocks used by the range tests.
// Each *_BASE is the first code point of its range; each *_LIMIT is written
// as "last + 1", i.e. it is an exclusive upper bound (callers test cp < LIMIT).
public static final int
CJK_BASE = 0x4E00,
CJK_LIMIT = 0x9FFF+1,
CJK_COMPAT_USED_BASE = 0xFA0E,
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
CJK_A_BASE = 0x3400,
CJK_A_LIMIT = 0x4DBF+1,
CJK_B_BASE = 0x20000,
CJK_B_LIMIT = 0x2A6DF+1;
static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes
// Unicode Property Types

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.4 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -18,7 +18,7 @@ import com.ibm.text.utility.*;
class UData implements UCD_Types {
String name;
String shortName = ""; // cache
String shortName; // cache
String decompositionMapping;
String simpleUppercase;
String simpleLowercase;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.15 $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -27,6 +27,27 @@ import com.ibm.text.utility.*;
import java.text.NumberFormat;
public class VerifyUCD implements UCD_Types {
/**
 * One-off diagnostic: after Default.setUCD(), prints for a few sample code
 * points (0x10000, 'a', 0xE0000, U+0221) the code, whether it is assigned,
 * whether it is allocated, its SHORT name and its default name, followed by
 * a blank line per sample. Output goes to System.out.
 */
static void oneTime() {
    Default.setUCD();

    int[] samples = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000
    int index = 0;
    while (index < samples.length) {
        int codePoint = samples[index++];
        System.out.println(Default.ucd.getCode(codePoint));

        // assigned?
        if (Default.ucd.isAssigned(codePoint)) {
            System.out.println(" assigned");
        } else {
            System.out.println(" unassigned");
        }

        // allocated?
        if (Default.ucd.isAllocated(codePoint)) {
            System.out.println(" allocated");
        } else {
            System.out.println(" unallocated");
        }

        // short name first, then the default-style name
        String shortName = Default.ucd.getName(codePoint, SHORT);
        System.out.println(" " + shortName);
        String fullName = Default.ucd.getName(codePoint);
        System.out.println(" " + fullName);

        System.out.println();
    }
}
static final byte NC = UNUSED_CATEGORY;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.3 $
* $Date: 2002/06/15 02:47:14 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -17,30 +17,65 @@ package com.ibm.text.utility;
// Simple stack mechanism, with push, pop and access
// =============================================================
public final class IntStack implements Comparable {
public final class IntStack implements Comparable, Cloneable {
private int[] values;
private int top = 0;
private int first = 0;
/**
 * Creates an empty stack backed by an array of initialSize ints.
 *
 * @param initialSize length of the initial backing array
 */
public IntStack(int initialSize) {
    this.values = new int[initialSize];
}
/**
 * Appends all live elements of other (those between its first and top
 * positions, oldest first) onto the top of this stack.
 *
 * The elements are copied straight from the backing arrays rather than via
 * get(), so the copy works even when other has had elements removed from
 * its front. Appending a stack to itself is supported and doubles its
 * contents.
 *
 * @param other stack to copy from
 * @return this stack, so calls can be chained
 */
public IntStack append(IntStack other) {
    // Snapshot the source before this stack changes; other may be this stack.
    final int[] source = other.values;
    final int start = other.first;
    final int count = other.top - start;
    final int needed = top + count;
    if (needed > values.length) {
        // Grow to at least double the current size, or more if doubling is not enough.
        int capacity = values.length * 2;
        if (capacity < needed) capacity = needed;
        int[] grown = new int[capacity];
        System.arraycopy(values, 0, grown, 0, top);
        values = grown;
    }
    // source still refers to the pre-growth array, which holds the original data.
    System.arraycopy(source, start, values, top, count);
    top = needed;
    return this;
}
public void push(int value) {
public IntStack append(int value) {
return push(value);
}
/**
 * Returns the number of elements currently held, i.e. the distance between
 * the first and top positions.
 */
public int length() {
    final int count = top - first;
    return count;
}
/**
 * Adds value on top of the stack, enlarging the backing array when it is full.
 *
 * @param value element to add
 * @return this stack, so calls can be chained
 */
public IntStack push(int value) {
    if (top >= values.length) { // must grow?
        // Doubling alone cannot enlarge a zero-length array (0 * 2 == 0),
        // so always make room for at least one more element.
        int[] temp = new int[Math.max(values.length * 2, top + 1)];
        System.arraycopy(values, 0, temp, 0, values.length);
        values = temp;
    }
    values[top++] = value;
    return this;
}
/**
 * Removes and returns the element on top of the stack.
 *
 * Only elements at or above the first position are live; once the last live
 * element has been removed, both positions are reset to the start of the
 * backing array.
 *
 * @return the element that was on top
 * @throws IllegalArgumentException if the stack is empty
 */
public int pop() {
    if (top <= first) {
        throw new IllegalArgumentException("Stack underflow");
    }
    int result = values[--top];
    if (top == first && first > 0) {
        // Stack is now empty: rewind so the front of the array is reused.
        top = first = 0;
    }
    return result;
}
/**
 * Removes and returns the element at the front (the first position) of the
 * stack. When that was the last remaining element, both positions are reset
 * to zero.
 *
 * @return the element that was at the front
 * @throws IllegalArgumentException if the stack is empty
 */
public int popFront() {
    if (top <= first) {
        throw new IllegalArgumentException("Stack underflow");
    }
    final int front = values[first];
    first++;
    if (first == top) {
        first = 0;
        top = 0;
    }
    return front;
}
/**
 * Returns the element at the given position without removing it.
 *
 * The index is counted from the front of the live contents, so valid
 * indices are 0 through length() - 1 regardless of how many elements have
 * been removed with popFront(). Slots that were already popped are never
 * exposed.
 *
 * @param index zero-based offset from the front element
 * @return the element at that offset
 * @throws IllegalArgumentException if index is negative or not less than length()
 */
public int get(int index) {
    if (0 <= index && index < top - first) return values[first + index];
    throw new IllegalArgumentException("Stack index out of bounds");
}
@ -49,22 +84,24 @@ public final class IntStack implements Comparable {
}
/**
 * Reports whether the stack holds no elements, i.e. whether the first and
 * top positions coincide.
 */
public boolean isEmpty() {
    return top == first;
}
/**
 * Empties the stack by resetting both the first and top positions to zero.
 * The backing array is kept as is.
 */
public void clear() {
    first = 0;
    top = 0;
}
public int compareTo(Object other) {
IntStack that = (IntStack) other;
int min = top;
if (min < that.top) min = that.top;
for (int i = 0; i < min; ++i) {
int result = values[i] - that.values[i];
int myLen = top - first;
int thatLen = that.top - that.first;
int limit = first + ((myLen < thatLen) ? myLen : thatLen);
int delta = that.first - first;
for (int i = first; i < limit; ++i) {
int result = values[i] - that.values[i + delta];
if (result != 0) return result;
}
return top - that.top;
return myLen - thatLen;
}
public boolean equals(Object other) {
@ -73,9 +110,19 @@ public final class IntStack implements Comparable {
public int hashCode() {
int result = top;
for (int i = 0; i < top; ++i) {
for (int i = first; i < top; ++i) {
result = result * 37 + values[i];
}
return result;
}
/**
 * Returns a copy of this stack that has its own backing array, so later
 * changes to either stack do not affect the other.
 */
public Object clone() {
    IntStack copy;
    try {
        copy = (IntStack) super.clone();
    } catch (CloneNotSupportedException e) {
        throw new IllegalArgumentException("Will never happen");
    }
    // super.clone() copies only the array reference; duplicate the array itself.
    copy.values = (int[]) values.clone();
    return copy;
}
}