mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 01:11:02 +00:00
finally did some significant code cleanup on collation. not enough, but it's a start
X-SVN-Rev: 8896
This commit is contained in:
parent
05d2989deb
commit
3940ed8c00
11 changed files with 522 additions and 627 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -165,6 +165,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(IntStack ces) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < ces.length(); ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(toString(ces.get(i)));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int ce) {
|
||||
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
|
||||
+ Utility.hex(UCA.getSecondary(ce)) + "."
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2002/06/04 01:59:01 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -18,7 +18,8 @@ import com.ibm.text.utility.*;
|
|||
|
||||
public class Main {
|
||||
static final String UCDVersion = "";
|
||||
static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", "writeconformance", "writeconformanceshifted",
|
||||
static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA",
|
||||
"writeconformance", "writeconformanceshifted",
|
||||
"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2002/06/04 01:58:56 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -24,6 +24,7 @@ import com.ibm.text.UCD.Normalizer;
|
|||
import com.ibm.text.UCD.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
//import com.ibm.text.CollationData.*;
|
||||
|
||||
|
@ -62,7 +63,7 @@ This is because of shared
|
|||
characters between scripts with different directions, like French with Arabic or Greek.
|
||||
*/
|
||||
|
||||
final public class UCA implements Comparator {
|
||||
final public class UCA implements Comparator, UCA_Types {
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
|
@ -85,19 +86,13 @@ final public class UCA implements Comparator {
|
|||
// base directory will change depending on the installation
|
||||
public static final String BASE_DIR = "c:\\DATA\\";
|
||||
|
||||
/** Enum for alternate handling */
|
||||
public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3;
|
||||
|
||||
/**
|
||||
* Used to terminate a list of CEs
|
||||
*/
|
||||
public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string
|
||||
|
||||
|
||||
// =============================================================
|
||||
// Test Settings
|
||||
// =============================================================
|
||||
static final boolean DEBUG = false;
|
||||
static final boolean DEBUG_SHOW_LINE = false;
|
||||
|
||||
static final boolean SHOW_STATS = true;
|
||||
|
||||
static final boolean SHOW_CE = false;
|
||||
|
@ -109,6 +104,7 @@ final public class UCA implements Comparator {
|
|||
static final boolean RECORDING_CHARS = true;
|
||||
|
||||
private UCD ucd;
|
||||
private UCA_Data ucaData;
|
||||
|
||||
// =============================================================
|
||||
// Main Methods
|
||||
|
@ -121,11 +117,7 @@ final public class UCA implements Comparator {
|
|||
*/
|
||||
public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
|
||||
fullData = source == null;
|
||||
|
||||
// clear some tables
|
||||
for (int i = 0; i < collationElements.length; ++i) {
|
||||
collationElements[i] = UNSUPPORTED;
|
||||
}
|
||||
|
||||
// load the normalizer
|
||||
if (toD == null) {
|
||||
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
|
||||
|
@ -134,6 +126,8 @@ final public class UCA implements Comparator {
|
|||
ucd = UCD.make(unicodeVersion);
|
||||
ucdVersion = ucd.getVersion();
|
||||
|
||||
ucaData = new UCA_Data(toD, ucd);
|
||||
|
||||
// either get the full sources, or just a demo set
|
||||
if (fullData) {
|
||||
for (int i = 0; i < KEYS.length; ++i) {
|
||||
|
@ -234,7 +228,7 @@ final public class UCA implements Comparator {
|
|||
}
|
||||
if (SHOW_CE) {
|
||||
if (debugList.length() != 0) debugList.append("/");
|
||||
debugList.append(ceToString(ce));
|
||||
debugList.append(CEList.toString(ce));
|
||||
}
|
||||
|
||||
// add weights
|
||||
|
@ -412,6 +406,35 @@ final public class UCA implements Comparator {
|
|||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends the collation elements (CEs) for an entire string to a stack.
|
||||
* @param sourceString string to get the CEs for;
|
||||
* the whole string is processed, starting at index 0.
|
||||
* @param decomposition true for UCA, false where the text is guaranteed to be
|
||||
* normalization form C with no combining marks of class 0.
|
||||
* @param output stack that each CE is pushed onto. Zero CEs are skipped, and no TERMINATOR is pushed.
|
||||
*/
|
||||
public void getCEs(String sourceString, boolean decomposition, IntStack output) {
|
||||
decompositionBuffer.setLength(0);
|
||||
if (decomposition) {
|
||||
toD.normalize(sourceString, decompositionBuffer);
|
||||
} else {
|
||||
decompositionBuffer.append(sourceString);
|
||||
}
|
||||
rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai)
|
||||
index = 0;
|
||||
|
||||
// process CEs, building weight strings
|
||||
while (true) {
|
||||
//fixQuaternatiesPosition = quaternaries.length();
|
||||
int ce = getCE();
|
||||
if (ce == 0) continue;
|
||||
if (ce == TERMINATOR) break;
|
||||
output.push(ce);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a list of CEs for a unicode character at a position.
|
||||
* @param sourceString string to make a sort key for.
|
||||
|
@ -477,14 +500,6 @@ final public class UCA implements Comparator {
|
|||
return strength == 1 ? primarySet : strength == 2 ? secondarySet : tertiarySet;
|
||||
}
|
||||
|
||||
/**
|
||||
* CE Type
|
||||
*/
|
||||
static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2,
|
||||
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
|
||||
FIXED_CE = 3;
|
||||
// SURROGATE_CE = 6,
|
||||
|
||||
/**
|
||||
* Returns the char associated with a FIXED value
|
||||
*/
|
||||
|
@ -497,28 +512,7 @@ final public class UCA implements Comparator {
|
|||
* Return the type of the CE
|
||||
*/
|
||||
public byte getCEType(int ch) {
|
||||
|
||||
if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands
|
||||
|
||||
int ce = collationElements[ch];
|
||||
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return NORMAL_CE;
|
||||
if (ce == UNSUPPORTED) {
|
||||
|
||||
// Special check for Han, Hangul
|
||||
if (isHangul(ch)) return HANGUL_CE;
|
||||
|
||||
if (isCJK(ch)) return CJK_CE;
|
||||
if (isCJK_AB(ch)) return CJK_AB_CE;
|
||||
|
||||
// special check for unsupported surrogate pair, 20 1/8 bits
|
||||
//if (0xD800 <= ch && ch <= 0xDFFF) {
|
||||
// return SURROGATE_CE;
|
||||
//}
|
||||
return UNSUPPORTED_CE;
|
||||
}
|
||||
|
||||
if (ce == CONTRACTING) return CONTRACTING_CE;
|
||||
return EXPANDING_CE;
|
||||
return ucaData.getCEType(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -604,19 +598,11 @@ final public class UCA implements Comparator {
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a human-readable string for a collation element
|
||||
*/
|
||||
static public String ceToString(int ce) {
|
||||
return "[" + Utility.hex(getPrimary(ce)) + "."
|
||||
+ Utility.hex(getSecondary(ce)) + "."
|
||||
+ Utility.hex(getTertiary(ce)) + "]";
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a human-readable string for a collation element.
|
||||
* value is terminated by -1!
|
||||
*/
|
||||
/*
|
||||
static public String ceToString(int[] ces, int len) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
|
@ -624,11 +610,13 @@ final public class UCA implements Comparator {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Produces a human-readable string for a collation element.
|
||||
* value is terminated by -1!
|
||||
*/
|
||||
/*
|
||||
static public String ceToString(int[] ces) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; ; ++i) {
|
||||
|
@ -637,7 +625,7 @@ final public class UCA implements Comparator {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
static boolean isImplicitLeadCE(int ce) {
|
||||
return isImplicitLeadPrimary(getPrimary(ce));
|
||||
|
@ -670,10 +658,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
* and to get the second part use (x & 0xFFFF)
|
||||
*/
|
||||
|
||||
static void CodepointToImplicit(int cp, int[] output) {
|
||||
void CodepointToImplicit(int cp, int[] output) {
|
||||
int base = UNSUPPORTED_OTHER_BASE;
|
||||
if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE;
|
||||
else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
if (ucd.isCJK_BASE(cp)) base = UNSUPPORTED_CJK_BASE;
|
||||
else if (ucd.isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
output[0] = base + (cp >>> 15);
|
||||
output[1] = (cp & 0x7FFF) | 0x8000;
|
||||
}
|
||||
|
@ -768,6 +756,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
// Privates
|
||||
// =============================================================
|
||||
|
||||
|
||||
IntStack expandingStack = new IntStack(10);
|
||||
|
||||
/**
|
||||
* Array used to reorder surrogates to top of 16-bit range, and others down.
|
||||
* Adds 2000 to D800..DFFF, making them F800..FFFF
|
||||
|
@ -847,77 +838,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
// Collation Element Memory Data Table Formats
|
||||
// =============================================================
|
||||
|
||||
/**
|
||||
* Used to compose Hangul and Han characters
|
||||
*/
|
||||
|
||||
static final int NEUTRAL_SECONDARY = 0x20;
|
||||
static final int NEUTRAL_TERTIARY = 0x02;
|
||||
|
||||
/**
|
||||
* Temporary buffer used in getSortKey for the decomposed string
|
||||
*/
|
||||
private StringBuffer decompositionBuffer = new StringBuffer();
|
||||
|
||||
/**
|
||||
* The collation element data is stored in a couple of different structures.
|
||||
* First is collationElements, which generally contains the 32-bit CE corresponding
|
||||
* to the data. It is directly indexed by character code.<br>
|
||||
* For brevity in the implementation, we just use a flat array.
|
||||
* A real implementation would use a multi-stage table, as described in TUS Section 5.
|
||||
* table of simple collation elements, indexed by char.<br>
|
||||
* Exceptional cases: expanding, contracting, unsupported are handled as described below.
|
||||
*/
|
||||
private int[] collationElements = new int[65536];
|
||||
|
||||
/**
|
||||
* A special bit combination in a CE is used to reserve exception cases. This has the effect
|
||||
* of removing a small number of the primary key values out of the 65536 possible.
|
||||
*/
|
||||
private static final int EXCEPTION_CE_MASK = 0xF8000000;
|
||||
|
||||
|
||||
/**
|
||||
* Any unsupported characters (those not in the UCA data tables)
|
||||
* are marked with an exception bit combination
|
||||
* so that they can be treated specially.<br>
|
||||
* There are at least 34 values, so that we can use a range for surrogates
|
||||
* However, we do add to the first weight if we have surrogate pairs!
|
||||
*/
|
||||
private static final int UNSUPPORTED_CJK_BASE = 0xFF40;
|
||||
private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
|
||||
private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
|
||||
|
||||
private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
|
||||
private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
|
||||
|
||||
private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
// was 0xFFC20101;
|
||||
|
||||
/**
|
||||
* Contracting characters are marked with a exception bit combination
|
||||
* in the collationElement table.
|
||||
* This means that they are the first character of a contraction, and need
|
||||
* to be looked up (with following characters) in the contractingTable.<br>
|
||||
* This isn't a MASK since there is exactly one value.
|
||||
*/
|
||||
private static final int CONTRACTING = 0xFA310000;
|
||||
|
||||
/**
|
||||
* Expanding characters are marked with a exception bit combination
|
||||
* in the collationElement table.
|
||||
* This means that they map to more than one CE, which is looked up in
|
||||
* the expansionTable by index. See EXCEPTION_INDEX_MASK
|
||||
*/
|
||||
private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
|
||||
|
||||
/**
|
||||
* This mask is used to get the index from an EXPANDING exception.
|
||||
* The contracting characters can also make use of this in a future optimization.
|
||||
*/
|
||||
static final int EXCEPTION_INDEX_MASK = 0x0000FFFF;
|
||||
|
||||
/**
|
||||
* We take advantage of the variables being in a closed range to save a bit per CE.
|
||||
* The low and high values are initially set to be at the opposite ends of the range,
|
||||
|
@ -931,27 +858,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
private int variableLowCE; // used for testing against
|
||||
private int variableHighCE; // used for testing against
|
||||
|
||||
/**
|
||||
* Although a single character can expand into multiple CEs, we don't want to burden
|
||||
* the normal case with the storage. So, they get a special value in the collationElements
|
||||
* array. This value has a distinct primary weight, followed by an index into a separate
|
||||
* table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
|
||||
* will be used for the expansion. The implementation is as a stack; this just makes it
|
||||
* easy to generate.
|
||||
*/
|
||||
private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
|
||||
|
||||
/**
|
||||
* For now, this is just a simple mapping of strings to collation elements.
|
||||
* The implementation depends on the contracting characters being "completed",
|
||||
* so that it can be efficiently determined when to stop looking.
|
||||
*/
|
||||
private Hashtable contractingTable = new Hashtable();
|
||||
/*
|
||||
|
||||
/**
|
||||
* Special char value that means failed or terminated
|
||||
*/
|
||||
private static final char NOT_A_CHAR = '\uFFFF';
|
||||
private void fixSurrogateContraction(char ch) {
|
||||
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
|
||||
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
|
||||
String chs = String.valueOf(ch);
|
||||
Object probe = contractingTable.get(chs);
|
||||
if (probe != null) return;
|
||||
contractingTable.put(chs, new Integer(UNSUPPORTED));
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
/**
|
||||
* Marks whether we are using the full data set, or an abbreviated version for
|
||||
|
@ -965,11 +883,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
// Made part of the object to avoid reallocating each time.
|
||||
// =============================================================
|
||||
|
||||
/**
|
||||
* Stack for expanding characters
|
||||
*/
|
||||
private IntStack expandingStack = new IntStack(100);
|
||||
|
||||
/**
|
||||
* Temporary buffers used in getSortKey to store weights
|
||||
* these are NOT strings of Unicode characters--they are
|
||||
|
@ -990,8 +903,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
* Temporary with requested decomposition
|
||||
*/
|
||||
boolean storedDecomposition;
|
||||
int hangulHackBottom;
|
||||
int hangulHackTop;
|
||||
|
||||
/**
|
||||
* Used for supporting Thai rearrangement
|
||||
|
@ -1015,7 +926,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
* (normalized) character code.
|
||||
*/
|
||||
private int getCE() {
|
||||
if (!expandingStack.isEmpty()) return expandingStack.pop();
|
||||
if (!expandingStack.isEmpty()) return expandingStack.popFront();
|
||||
char ch;
|
||||
|
||||
// Fetch next character. Handle rearrangement for Thai, etc.
|
||||
|
@ -1037,190 +948,56 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
}
|
||||
}
|
||||
|
||||
int ce = collationElements[ch];
|
||||
|
||||
// Hangul tailoring hack
|
||||
//if (!storedDecomposition && hangulHackBottom <= ce && ce < hangulHackTop) return fixJamo(ch, ce); // hard coded fix!!
|
||||
|
||||
// if the CE is not exceptional (unsupported, contracting, expanding) we are done.
|
||||
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
|
||||
|
||||
if (ce == UNSUPPORTED) {
|
||||
int bigChar = ch;
|
||||
index = ucaData.get(ch, decompositionBuffer, index, expandingStack);
|
||||
int ce = expandingStack.popFront(); // pop first (guaranteed to exist!)
|
||||
if (ce == UNSUPPORTED_FLAG) {
|
||||
return handleUnsupported(ch);
|
||||
}
|
||||
return ce;
|
||||
}
|
||||
|
||||
private int handleUnsupported(char ch) {
|
||||
int bigChar = ch;
|
||||
|
||||
// Special check for Hangul
|
||||
if (isHangul(bigChar)) {
|
||||
// MUST DECOMPOSE!!
|
||||
hangulBuffer = new StringBuffer();
|
||||
decomposeHangul(bigChar, hangulBuffer);
|
||||
return getCE();
|
||||
// RECURSIVE!!!
|
||||
}
|
||||
// Special check for Hangul
|
||||
if (ucd.isHangulSyllable(bigChar)) {
|
||||
// MUST DECOMPOSE!!
|
||||
hangulBuffer = new StringBuffer();
|
||||
decomposeHangul(bigChar, hangulBuffer);
|
||||
return getCE();
|
||||
// RECURSIVE!!!
|
||||
}
|
||||
|
||||
// special check and fix for unsupported surrogate pair, 20 1/8 bits
|
||||
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
|
||||
// ignore unmatched surrogates (e.g. return zero)
|
||||
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
|
||||
int ch2 = decompositionBuffer.charAt(index);
|
||||
if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched
|
||||
index++; // skip next char
|
||||
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
|
||||
}
|
||||
|
||||
|
||||
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
|
||||
return 0;
|
||||
}
|
||||
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
|
||||
return 0;
|
||||
}
|
||||
|
||||
// special check and fix for unsupported surrogate pair, 20 1/8 bits
|
||||
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
|
||||
// ignore unmatched surrogates (e.g. return zero)
|
||||
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
|
||||
int ch2 = decompositionBuffer.charAt(index);
|
||||
if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched
|
||||
index++; // skip next char
|
||||
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
|
||||
}
|
||||
|
||||
// find the implicit values; returned in 0 and 1
|
||||
int[] implicit = new int[2];
|
||||
CodepointToImplicit(bigChar, implicit);
|
||||
// find the implicit values; returned in 0 and 1
|
||||
int[] implicit = new int[2];
|
||||
CodepointToImplicit(bigChar, implicit);
|
||||
|
||||
// Now compose the two keys
|
||||
// first push BBBB, which is #1
|
||||
// Now compose the two keys
|
||||
|
||||
// push BBBB
|
||||
|
||||
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
|
||||
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
|
||||
|
||||
// return AAAA
|
||||
|
||||
// now return AAAA, which is #0
|
||||
|
||||
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
|
||||
}
|
||||
if (ce == CONTRACTING) {
|
||||
// Contracting is probably the most interesting (read "tricky") part
|
||||
// of the algorithm.
|
||||
// First get longest substring that is in the contracting table.
|
||||
// For simplicity, we use a hash table for contracting.
|
||||
// There are much better optimizations,
|
||||
// but they take a more complicated build algorithm than we want to show here.
|
||||
// NOTE: We are guaranteed that the character itself is in the contracting table because
|
||||
// of the build process.
|
||||
String probe = String.valueOf(ch);
|
||||
Object value = contractingTable.get(probe);
|
||||
if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
|
||||
|
||||
// We loop, trying to add successive characters to the longest substring.
|
||||
while (index < decompositionBuffer.length()) {
|
||||
char ch2 = decompositionBuffer.charAt(index);
|
||||
|
||||
// see whether the current string plus the next char are in
|
||||
// the contracting table.
|
||||
String newProbe = probe + ch2;
|
||||
Object newValue = contractingTable.get(newProbe);
|
||||
if (newValue == null) break; // stop if not in table.
|
||||
|
||||
// We succeeded--so update our new values, and set index
|
||||
// and quaternary to indicate that we swallowed another character.
|
||||
probe = newProbe;
|
||||
value = newValue;
|
||||
index++;
|
||||
}
|
||||
|
||||
// Now, see if we can add any combining marks
|
||||
short lastCan = 0;
|
||||
for (int i = index; i < decompositionBuffer.length(); ++i) {
|
||||
// We only take certain characters. They have to be accents,
|
||||
// and they have to not be blocked.
|
||||
// Unlike above, if we don't find a match (and it was an accent!)
|
||||
// then we don't stop, we continue looping.
|
||||
char ch2 = decompositionBuffer.charAt(i);
|
||||
short can = toD.getCanonicalClass(ch2);
|
||||
if (can == 0) break; // stop with any zero (non-accent)
|
||||
if (can == lastCan) continue; // blocked if same class as last
|
||||
lastCan = can; // remember for next time
|
||||
|
||||
// Now see if we can successfully add it onto our string
|
||||
// and find it in the contracting table.
|
||||
String newProbe = probe + ch2;
|
||||
Object newValue = contractingTable.get(newProbe);
|
||||
if (newValue == null) continue;
|
||||
|
||||
// We succeeded--so update our new values, remove the char, and update
|
||||
// quaternary to indicate that we swallowed another character.
|
||||
probe = newProbe;
|
||||
value = newValue;
|
||||
decompositionBuffer.setCharAt(i,'\u0000'); // zero char
|
||||
}
|
||||
|
||||
// we are all done, and can extract the CE from the last value set.
|
||||
ce = ((Integer)value).intValue();
|
||||
// if the CE is not exceptional (unsupported expanding) we are done.
|
||||
// BTW we will never have a contracting CE at this point.
|
||||
if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
|
||||
// otherwise fall through to expansion
|
||||
}
|
||||
// expanding, so copy list of items onto stack
|
||||
int index = ce & EXCEPTION_INDEX_MASK; // get index
|
||||
// copy onto stack from index until reach TERMINATOR
|
||||
while (true) {
|
||||
ce = expandingTable.get(index++);
|
||||
if (ce == TERMINATOR) break;
|
||||
expandingStack.push(ce);
|
||||
}
|
||||
return expandingStack.pop(); // pop last (guaranteed to exist!)
|
||||
}
|
||||
|
||||
// Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
|
||||
|
||||
public static boolean isCJK(int cp) {
|
||||
return (CJK_BASE <= cp && cp < CJK_LIMIT
|
||||
|| cp == 0xFA0E // compat characters that don't decompose.
|
||||
|| cp == 0xFA0F
|
||||
|| cp == 0xFA11
|
||||
|| cp == 0xFA13
|
||||
|| cp == 0xFA14
|
||||
|| cp == 0xFA1F
|
||||
|| cp == 0xFA21
|
||||
|| cp == 0xFA23
|
||||
|| cp == 0xFA24
|
||||
|| cp == 0xFA27
|
||||
|| cp == 0xFA28
|
||||
|| cp == 0xFA29
|
||||
|| cp == 0xFA2E
|
||||
|| cp == 0xFA2F
|
||||
);
|
||||
}
|
||||
|
||||
public static final int
|
||||
CJK_BASE = 0x4E00,
|
||||
CJK_LIMIT = 0x9FFF+1,
|
||||
CJK_COMPAT_USED_BASE = 0xFA0E,
|
||||
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
|
||||
CJK_A_BASE = 0x3400,
|
||||
CJK_A_LIMIT = 0x4DBF+1,
|
||||
CJK_B_BASE = 0x20000,
|
||||
CJK_B_LIMIT = 0x2A6DF+1;
|
||||
|
||||
public static final boolean isCJK_AB(int bigChar) {
|
||||
return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT
|
||||
|| CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
|
||||
}
|
||||
/*
|
||||
2E80..2EFF; CJK Radicals Supplement
|
||||
2F00..2FDF; Kangxi Radicals
|
||||
|
||||
3400..4DBF; CJK Unified Ideographs Extension A
|
||||
4E00..9FFF; CJK Unified Ideographs
|
||||
F900..FAFF; CJK Compatibility Ideographs
|
||||
|
||||
20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
|
||||
Compat:
|
||||
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
|
||||
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
|
||||
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
|
||||
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
|
||||
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
|
||||
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
|
||||
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
|
||||
# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
|
||||
# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
|
||||
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
||||
|
||||
*/
|
||||
|
||||
private final boolean isHangul(int bigChar) {
|
||||
return (0xAC00 <= bigChar && bigChar <= 0xD7A3);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1287,12 +1064,12 @@ Compat:
|
|||
*/
|
||||
private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0;
|
||||
private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1;
|
||||
Map multiTable = new TreeMap();
|
||||
BitSet found = new BitSet();
|
||||
UnicodeSet found = new UnicodeSet();
|
||||
|
||||
public Hashtable getContracting() {
|
||||
/*public Hashtable getContracting() {
|
||||
return new Hashtable(multiTable);
|
||||
}
|
||||
*/
|
||||
|
||||
public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
|
||||
return new UCAContents(ceLimit, skipDecomps, ucdVersion);
|
||||
|
@ -1317,6 +1094,16 @@ Compat:
|
|||
this.ceLimit = ceLimit;
|
||||
this.nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
|
||||
this.skipDecomps = skipDecomps;
|
||||
|
||||
// FIX SAMPLES
|
||||
if (SAMPLE_RANGES[0][0] == 0) {
|
||||
for (int i = 0; ; ++i) { // add first unallocated character
|
||||
if (!ucd.isAssigned(i)) {
|
||||
SAMPLE_RANGES[0][0] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1334,7 +1121,9 @@ Compat:
|
|||
|
||||
// normal case
|
||||
while (current++ < 0x10FFFF) {
|
||||
|
||||
if (current == 0x406) {
|
||||
System.out.println("DEBUG");
|
||||
}
|
||||
//char ch = (char)current;
|
||||
byte type = getCEType(current);
|
||||
if (type >= ceLimit || type == CONTRACTING_CE) continue;
|
||||
|
@ -1349,15 +1138,18 @@ Compat:
|
|||
}
|
||||
|
||||
// contractions
|
||||
if (enum == null) enum = multiTable.keySet().iterator();
|
||||
if (enum.hasNext()) {
|
||||
if (enum == null) enum = ucaData.getContractions();
|
||||
while (enum.hasNext()) {
|
||||
result = (String)enum.next();
|
||||
if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
|
||||
//System.out.println("Skipping " + ucd.getCodeAndName(result));
|
||||
continue; // try again
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// extra samples
|
||||
if (currentRange < SAMPLE_RANGES.length) {
|
||||
System.out.println("*");
|
||||
try {
|
||||
result = UTF16.valueOf(itemInRange);
|
||||
} catch (RuntimeException e) {
|
||||
|
@ -1372,10 +1164,11 @@ Compat:
|
|||
endOfRange = SAMPLE_RANGES[currentRange].length > 1
|
||||
? SAMPLE_RANGES[currentRange][1]
|
||||
: startOfRange;
|
||||
skip = ((endOfRange - startOfRange) / 513);
|
||||
//skip = ((endOfRange - startOfRange) / 3);
|
||||
}
|
||||
} else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) {
|
||||
itemInRange += skip;
|
||||
} else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) {
|
||||
//itemInRange += skip;
|
||||
itemInRange = endOfRange - 5;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1410,14 +1203,16 @@ Compat:
|
|||
}
|
||||
|
||||
static final int[][] SAMPLE_RANGES = {
|
||||
{0x10000},
|
||||
{0x10FFFF},
|
||||
{0x0220},
|
||||
{0}, // LEAVE EMPTY--Turns into first unassigned character
|
||||
{0xFFF0},
|
||||
{0xD800},
|
||||
{0xDFFF},
|
||||
{0xFFFE},
|
||||
{0xFFFF},
|
||||
{0x10000},
|
||||
{0xC0000},
|
||||
{0xD0000},
|
||||
{0x10FFFF},
|
||||
{0x10FFFE},
|
||||
{0x10FFFF},
|
||||
{0x3400, 0x4DB5},
|
||||
|
@ -1426,7 +1221,7 @@ Compat:
|
|||
{0xA000, 0xA48C},
|
||||
{0xE000, 0xF8FF},
|
||||
{0x20000, 0x2A6D6},
|
||||
{0xE0000, 0xE00FF},
|
||||
{0xE0000, 0xE007E},
|
||||
{0xF0000, 0xF00FD},
|
||||
{0xFFF00, 0xFFFFD},
|
||||
{0x100000, 0x1000FD},
|
||||
|
@ -1438,7 +1233,7 @@ Compat:
|
|||
* Values will override any previous mappings.
|
||||
*/
|
||||
private void addCollationElements(BufferedReader in) throws java.io.IOException {
|
||||
IntStack tempStack = new IntStack(100); // used for reversal
|
||||
IntStack tempStack = new IntStack(100);
|
||||
StringBuffer multiChars = new StringBuffer(); // used for contracting chars
|
||||
String inputLine = "";
|
||||
boolean[] wasImplicitLeadPrimary = new boolean[1];
|
||||
|
@ -1448,6 +1243,10 @@ Compat:
|
|||
if (inputLine == null) break; // means file is done
|
||||
String line = cleanLine(inputLine); // remove comments, extra whitespace
|
||||
if (line.length() == 0) continue; // skip empty lines
|
||||
|
||||
if (DEBUG_SHOW_LINE) {
|
||||
System.out.println("Processing: " + inputLine);
|
||||
}
|
||||
|
||||
position[0] = 0; // start at front of line
|
||||
if (line.startsWith("@version")) {
|
||||
|
@ -1464,29 +1263,21 @@ Compat:
|
|||
}
|
||||
|
||||
// collect characters
|
||||
char value = getChar(line, position);
|
||||
fixSurrogateContraction(value);
|
||||
char value2 = getChar(line, position);
|
||||
multiChars.setLength(0); // clear buffer
|
||||
if (value2 != NOT_A_CHAR) {
|
||||
fixSurrogateContraction(value2);
|
||||
multiChars.append(value); // append until we get terminator
|
||||
|
||||
char value = getChar(line, position);
|
||||
multiChars.append(value);
|
||||
|
||||
//fixSurrogateContraction(value);
|
||||
char value2 = getChar(line, position);
|
||||
// append until we get terminator
|
||||
while (value2 != NOT_A_CHAR) {
|
||||
multiChars.append(value2);
|
||||
while (true) {
|
||||
value2 = getChar(line, position);
|
||||
if (value2 == NOT_A_CHAR) break;
|
||||
fixSurrogateContraction(value2);
|
||||
multiChars.append(value2);
|
||||
}
|
||||
value2 = getChar(line, position);
|
||||
}
|
||||
|
||||
if (RECORDING_CHARS) {
|
||||
if (multiChars.length() > 1) {
|
||||
multiTable.put(multiChars.toString(), "");
|
||||
}
|
||||
found.set(value);
|
||||
for (int i = 1; i < multiChars.length(); ++i) {
|
||||
found.set(multiChars.charAt(i));
|
||||
}
|
||||
found.addAll(multiChars.toString());
|
||||
}
|
||||
if (!fullData && RECORDING_DATA) {
|
||||
if (value == 0 || value == '\t' || value == '\n' || value == '\r'
|
||||
|
@ -1522,141 +1313,69 @@ Compat:
|
|||
}
|
||||
}
|
||||
}
|
||||
if (ce2 != TERMINATOR) { // have expanding character!
|
||||
// put list into the expanding table
|
||||
// use a temporary stack to get them in reverse order
|
||||
tempStack.push(ce);
|
||||
tempStack.push(ce2);
|
||||
// set collationElement to exception value, plus index
|
||||
ce = EXPANDING_MASK | expandingTable.getTop();
|
||||
while (true) {
|
||||
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
|
||||
if (ce2 == TERMINATOR) break;
|
||||
tempStack.push(ce2);
|
||||
}
|
||||
// push onto expanding table, now in reverse order
|
||||
while (!tempStack.isEmpty()) expandingTable.push(tempStack.pop());
|
||||
expandingTable.push(TERMINATOR);
|
||||
}
|
||||
|
||||
//if (value == 0xd801) System.out.print("DEBUG: " + line);
|
||||
|
||||
// assign CE(s) to char(s)
|
||||
if (multiChars.length() > 0) {
|
||||
contractingTable.put(multiChars.toString(), new Integer(ce));
|
||||
if (collationElements[value] == UNSUPPORTED) {
|
||||
collationElements[value] = CONTRACTING; // mark special
|
||||
} else if (collationElements[value] != CONTRACTING) {
|
||||
// move old value to contracting table!
|
||||
contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
|
||||
collationElements[value] = CONTRACTING; // signal we must look up in table
|
||||
}
|
||||
} else if (collationElements[value] == CONTRACTING) {
|
||||
// must add old value to contracting table!
|
||||
contractingTable.put(String.valueOf(value), new Integer(ce));
|
||||
} else {
|
||||
collationElements[value] = ce; // normal
|
||||
}
|
||||
//} catch (Exception e) {
|
||||
// throw new IllegalArgumentException("Malformed line: " + inputLine + "\n "
|
||||
// + e.getClass().getName() + ": " + e.getMessage());
|
||||
tempStack.clear();
|
||||
tempStack.push(ce);
|
||||
|
||||
while (ce2 != TERMINATOR) {
|
||||
tempStack.push(ce2);
|
||||
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
|
||||
if (ce2 == TERMINATOR) break;
|
||||
}
|
||||
|
||||
ucaData.add(multiChars, tempStack);
|
||||
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("Error on line: " + inputLine);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private void fixSurrogateContraction(char ch) {
|
||||
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
|
||||
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
|
||||
String chs = String.valueOf(ch);
|
||||
Object probe = contractingTable.get(chs);
|
||||
if (probe != null) return;
|
||||
contractingTable.put(chs, new Integer(0));
|
||||
}
|
||||
|
||||
/*
|
||||
private void concat(int[] ces1, int[] ces2) {
|
||||
|
||||
}
|
||||
|
||||
private void add(String source, int[] ces, int ceLen) {
|
||||
|
||||
int ce;
|
||||
if (ceLen < 1) {
|
||||
throw new IllegalArgumentException("CE too short: " + ceLen);
|
||||
} else if (ceLen == 1) {
|
||||
ce = ces[0];
|
||||
} else {
|
||||
ce = EXPANDING_MASK | expandingTable.getTop();
|
||||
for (int i = 0; i < ceLen; ++i) {
|
||||
expandingTable.push(ces[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// assign CE(s) to char(s)
|
||||
int value = source.charAt(0);
|
||||
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
|
||||
|
||||
if (source.length() > 0) {
|
||||
contractingTable.put(source.toString(), new Integer(ce));
|
||||
if (collationElements[value] == UNSUPPORTED) {
|
||||
collationElements[value] = CONTRACTING; // mark special
|
||||
} else if (collationElements[value] != CONTRACTING) {
|
||||
// move old value to contracting table!
|
||||
contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
|
||||
collationElements[value] = CONTRACTING; // signal we must look up in table
|
||||
}
|
||||
} else if (collationElements[value] == CONTRACTING) {
|
||||
// must add old value to contracting table!
|
||||
contractingTable.put(source, new Integer(ce));
|
||||
} else {
|
||||
collationElements[source.charAt(0)] = ce; // normal
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Checks the internal tables corresponding to the UCA data.
|
||||
*/
|
||||
private void cleanup() {
|
||||
|
||||
// at this point, we have to guarantee that the contractingTable is CLOSED
|
||||
// e.g. if a substring of length n is in the table, then the first n-1 characters
|
||||
// are also!!
|
||||
ucaData.checkConsistency();
|
||||
|
||||
Map missingStrings = new HashMap();
|
||||
Map tempMap = new HashMap();
|
||||
|
||||
|
||||
/*
|
||||
0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
|
||||
0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
|
||||
int[] temp1 = int[20];
|
||||
int[] temp2 = int[20];
|
||||
int[] temp3 = int[20];
|
||||
getCEs("\u0fb2", true, temp1);
|
||||
getCEs("\u0fb3", true, temp2);
|
||||
getCEs("\u0f71", true, temp3);
|
||||
add("\u0FB2\u0F71", concat(temp1, temp3));
|
||||
*/
|
||||
|
||||
Hashtable missingStrings = new Hashtable();
|
||||
|
||||
int[] temp1 = new int[20];
|
||||
Enumeration enum = contractingTable.keys();
|
||||
while (enum.hasMoreElements()) {
|
||||
String sequence = (String)enum.nextElement();
|
||||
Iterator enum = ucaData.getContractions();
|
||||
while (enum.hasNext()) {
|
||||
String sequence = (String)enum.next();
|
||||
//System.out.println("Contraction: " + Utility.hex(sequence));
|
||||
for (int i = sequence.length()-1; i > 0; --i) {
|
||||
String shorter = sequence.substring(0,i);
|
||||
Object probe = contractingTable.get(shorter);
|
||||
if (probe == null) {
|
||||
int len = getCEs(shorter, true, temp1);
|
||||
if (false) System.out.println("WARNING: CLOSING: " + UCD.make().getCodeAndName(shorter) + " => " + ceToString(temp1, len));
|
||||
add(shorter, temp1, len);
|
||||
if (!ucaData.contractionTableContains(shorter)) {
|
||||
IntStack tempStack = new IntStack(1);
|
||||
getCEs(shorter, true, tempStack);
|
||||
if (false) System.out.println("WARNING: CLOSING: " + ucd.getCodeAndName(shorter)
|
||||
+ " => " + CEList.toString(tempStack));
|
||||
tempMap.put(shorter, tempStack);
|
||||
// missingStrings.put(shorter,"");
|
||||
// collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum = missingStrings.keys();
|
||||
// now add them. We couldn't before because we were iterating over it.
|
||||
|
||||
enum = tempMap.keySet().iterator();
|
||||
while (enum.hasNext()) {
|
||||
String shorter = (String) enum.next();
|
||||
IntStack tempStack = (IntStack) tempMap.get(shorter);
|
||||
ucaData.add(shorter, tempStack);
|
||||
}
|
||||
|
||||
|
||||
enum = missingStrings.keySet().iterator();
|
||||
if (missingStrings.size() != 0) {
|
||||
/**
|
||||
while (enum.hasMoreElements()) {
|
||||
|
@ -1666,26 +1385,30 @@ Compat:
|
|||
}
|
||||
*/
|
||||
String errorMessage = "";
|
||||
while (enum.hasMoreElements()) {
|
||||
String missing = (String)enum.nextElement();
|
||||
while (enum.hasNext()) {
|
||||
String missing = (String)enum.next();
|
||||
if (errorMessage.length() != 0) errorMessage += ", ";
|
||||
errorMessage += "\"" + missing + "\"";
|
||||
}
|
||||
throw new IllegalArgumentException("Contracting table not closed! Missing " + errorMessage);
|
||||
}
|
||||
|
||||
|
||||
//fixlater;
|
||||
variableLowCE = variableLow << 16;
|
||||
variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits
|
||||
|
||||
hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
|
||||
hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
|
||||
if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
|
||||
//int hangulHackBottom;
|
||||
//int hangulHackTop;
|
||||
|
||||
//hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
|
||||
//hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
|
||||
//if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
|
||||
|
||||
// show some statistics
|
||||
if (SHOW_STATS) System.out.println("\tcount1: " + count1);
|
||||
if (SHOW_STATS) System.out.println("\tcount2: " + max2);
|
||||
if (SHOW_STATS) System.out.println("\tcount3: " + max3);
|
||||
if (SHOW_STATS) System.out.println("\tcontractions: " + ucaData.getContractionCount());
|
||||
|
||||
if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1));
|
||||
if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2));
|
||||
|
@ -1912,7 +1635,7 @@ Compat:
|
|||
/**
|
||||
* Used for checking data file integrity
|
||||
*/
|
||||
private Hashtable uniqueTable = new Hashtable();
|
||||
private Map uniqueTable = new HashMap();
|
||||
|
||||
/**
|
||||
* Used for checking data file integrity
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -31,9 +31,12 @@ import com.ibm.text.UCD.UCD_Types;
|
|||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UCD.Normalizer;
|
||||
|
||||
public class WriteCollationData implements UCD_Types {
|
||||
public class WriteCollationData implements UCD_Types, UCA_Types {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
static final boolean DEBUG_SHOW_ITERATION = true;
|
||||
|
||||
|
||||
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
@ -289,7 +292,21 @@ public class WriteCollationData implements UCD_Types {
|
|||
|
||||
|
||||
static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException {
|
||||
UCD ucd30 = UCD.make("3.0.0");
|
||||
//UCD ucd30 = UCD.make("3.0.0");
|
||||
|
||||
/*
|
||||
U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
||||
=> U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON
|
||||
*/
|
||||
String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"};
|
||||
for (int jj = 0; jj < testList.length; ++jj) {
|
||||
String t = testList[jj];
|
||||
System.out.println(ucd.getCodeAndName(t));
|
||||
String test = collator.getSortKey(t, UCA.NON_IGNORABLE);
|
||||
System.out.println("Decomp: " + collator.toString(test));
|
||||
test = collator.getSortKey(t, UCA.NON_IGNORABLE, false);
|
||||
System.out.println("No Dec: " + collator.toString(test));
|
||||
}
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
|
||||
if (!shortPrint) log.write('\uFEFF');
|
||||
|
@ -297,9 +314,39 @@ public class WriteCollationData implements UCD_Types {
|
|||
System.out.println("Sorting");
|
||||
int counter = 0;
|
||||
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
|
||||
cc.enableSamples();
|
||||
UnicodeSet found2 = new UnicodeSet();
|
||||
|
||||
while (true) {
|
||||
String s = cc.next();
|
||||
if (s == null) break;
|
||||
|
||||
found2.addAll(s);
|
||||
|
||||
if (DEBUG_SHOW_ITERATION) {
|
||||
int cp = UTF16.charAt(s, 0);
|
||||
if (cp == 0x220 || !ucd.isAssigned(cp) || ucd.isCJK_BASE(cp)) {
|
||||
System.out.println(ucd.getCodeAndName(s));
|
||||
}
|
||||
}
|
||||
Utility.dot(counter++);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
addStringX(s, option);
|
||||
// TODO: add other accents with Cyrillic
|
||||
}
|
||||
|
||||
UnicodeSet found = collator.found;
|
||||
if (!found2.containsAll(found)) { // was containsAll(found2): a set always contains itself, so the check never fired; compare against collator.found
|
||||
System.out.println("In both: " + new UnicodeSet(found).retainAll(found2).toPattern(true));
|
||||
System.out.println("In UCA but not iteration: " + new UnicodeSet(found).removeAll(found2).toPattern(true));
|
||||
System.out.println("In iteration but not UCA: " + new UnicodeSet(found2).removeAll(found).toPattern(true));
|
||||
throw new IllegalArgumentException("Inconsistent data");
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
addStringX(UTF32.valueOf32(i), option);
|
||||
}
|
||||
|
||||
|
@ -318,15 +365,6 @@ public class WriteCollationData implements UCD_Types {
|
|||
addStringX(s, option);
|
||||
}
|
||||
|
||||
for (int i = 0; ; ++i) { // add first unallocated character
|
||||
if (!ucd.isAssigned(i)) {
|
||||
String s = UTF32.valueOf32(i);
|
||||
Utility.fixDot();
|
||||
System.out.println("Adding: " + Utility.hex(s));
|
||||
addStringX(s, option);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < extraConformanceRanges.length; ++i) {
|
||||
|
@ -343,6 +381,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
addStringX(end-1, option);
|
||||
addStringX(end, option);
|
||||
}
|
||||
*/
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Total: " + sortedD.size());
|
||||
|
@ -364,12 +403,12 @@ public class WriteCollationData implements UCD_Types {
|
|||
//String status = key.equals(lastKey) ? "*" : "";
|
||||
//lastKey = key;
|
||||
//log.println(source);
|
||||
char extra = source.charAt(source.length()-1);
|
||||
String clipped = source.substring(0, source.length()-1);
|
||||
String stren = source.substring(source.length()-1);
|
||||
if (!shortPrint) {
|
||||
log.print(Utility.hex(source));
|
||||
log.print(
|
||||
";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key));
|
||||
";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
|
||||
} else {
|
||||
log.print(source + "\t" + Utility.hex(clipped));
|
||||
}
|
||||
|
@ -384,13 +423,15 @@ public class WriteCollationData implements UCD_Types {
|
|||
static void addStringX(int x, byte option) {
|
||||
addStringX(UTF32.valueOf32(x), option);
|
||||
}
|
||||
|
||||
static final char LOW_ACCENT = '\u0325';
|
||||
|
||||
static void addStringX(String s, byte option) {
|
||||
addStringY(s + 'a', option);
|
||||
addStringY(s + 'A', option);
|
||||
addStringY(s + 'á', option);
|
||||
addStringY(s + 'b', option);
|
||||
addStringY(s + '\u0325', option);
|
||||
addStringY(s + LOW_ACCENT, option);
|
||||
addStringY(s + '!', option);
|
||||
}
|
||||
|
||||
|
@ -527,7 +568,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
|
||||
if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
|
||||
int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
|
||||
String comp = collator.ceToString(kenComp, kenCLen);
|
||||
String comp = CEList.toString(kenComp, kenCLen);
|
||||
|
||||
if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
|
||||
forLater.put((char)(COMPRESSED | type) + s, comp);
|
||||
|
@ -567,10 +608,10 @@ public class WriteCollationData implements UCD_Types {
|
|||
String comp = (String)forLater.get(key);
|
||||
|
||||
int kenLen = collator.getCEs(s, decompType, kenCes);
|
||||
String kenStr = collator.ceToString(kenCes, kenLen);
|
||||
String kenStr = CEList.toString(kenCes, kenLen);
|
||||
|
||||
int markLen = fixCompatibilityCE(s, true, markCes, false);
|
||||
String markStr = collator.ceToString(markCes, markLen);
|
||||
String markStr = CEList.toString(markCes, markLen);
|
||||
|
||||
if ((type & COMPRESSED) != 0) {
|
||||
log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
|
||||
|
@ -589,7 +630,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
log.println("NFD : " + ucd.getCodeAndName(nfd));
|
||||
}
|
||||
//kenCLen = collator.getCEs(decomp, true, kenComp);
|
||||
//log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));
|
||||
//log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
|
@ -785,7 +826,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
|
||||
if (s.length() > 1) {
|
||||
diLog.println(Utility.hex(s, " ")
|
||||
+ ";\t #" + collator.ceToString(ces, len)
|
||||
+ ";\t #" + CEList.toString(ces, len)
|
||||
+ " ( " + s + " )"
|
||||
+ " " + ucd.getName(s));
|
||||
}
|
||||
|
@ -859,7 +900,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
ccc = UTF32.char32At(s,kk);
|
||||
byte cat = ucd.getCategory(ccc);
|
||||
if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
|
||||
sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
|
||||
sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -882,7 +923,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
if (collator.isVariable(ce)) haveMixture |= 1;
|
||||
else haveMixture |= 2;
|
||||
if (haveMixture == 3) {
|
||||
mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
|
||||
mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1030,7 +1071,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
ccc = UTF32.char32At(s,kk);
|
||||
byte cat = ucd.getCategory(ccc);
|
||||
if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
|
||||
sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
|
||||
sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1053,7 +1094,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
if (collator.isVariable(ce)) haveMixture |= 1;
|
||||
else haveMixture |= 2;
|
||||
if (haveMixture == 3) {
|
||||
mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
|
||||
mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1130,8 +1171,8 @@ public class WriteCollationData implements UCD_Types {
|
|||
+ "\t" + head
|
||||
//+ "\t" + Utility.hex(oldWeight)
|
||||
//+ " => " + Utility.hex(newWeight)
|
||||
+ "\t" + collator.ceToString(ces, len)
|
||||
+ (doNew ? " => " + collator.ceToString(newCes, newLen) : "")
|
||||
+ "\t" + CEList.toString(ces, len)
|
||||
+ (doNew ? " => " + CEList.toString(newCes, newLen) : "")
|
||||
+ "\t( " + src + " )"
|
||||
+ "\t" + ucd.getName(src)
|
||||
);
|
||||
|
@ -1198,7 +1239,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
|
||||
if (false) {
|
||||
int len2 = collator.getCEs("\u2474", true, ces);
|
||||
System.out.println(UCA.ceToString(ces, len2));
|
||||
System.out.println(CEList.toString(ces, len2));
|
||||
|
||||
String a = collator.getSortKey("a");
|
||||
String b = collator.getSortKey("A");
|
||||
|
@ -1442,9 +1483,9 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
|
||||
if (false) System.out.println(
|
||||
collator.ceToString(lastCE) + " "
|
||||
+ collator.ceToString(ce) + " "
|
||||
+ collator.ceToString(nextCE) + " "
|
||||
CEList.toString(lastCE) + " "
|
||||
+ CEList.toString(ce) + " "
|
||||
+ CEList.toString(nextCE) + " "
|
||||
+ ucd.getCodeAndName(chr)
|
||||
);
|
||||
|
||||
|
@ -1513,7 +1554,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
*/
|
||||
|
||||
if (chr.equals("\u2F00")) {
|
||||
System.out.println(UCA.ceToString(ces, len));
|
||||
System.out.println(CEList.toString(ces, len));
|
||||
}
|
||||
|
||||
// There are double-CEs, so we have to know what the length of the first bit is.
|
||||
|
@ -1561,7 +1602,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
|
||||
if (option == WITH_NAMES) {
|
||||
log.print("\t# "
|
||||
+ collator.ceToString(ces, len) + " "
|
||||
+ CEList.toString(ces, len) + " "
|
||||
+ ucd.getCodeAndName(chr));
|
||||
if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion));
|
||||
}
|
||||
|
@ -1801,7 +1842,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
// we failed completely. Print error message, and bail
|
||||
|
||||
System.out.println("No back map for " + collator.ceToString(ces[i])
|
||||
System.out.println("No back map for " + CEList.toString(ces[i])
|
||||
+ " from " + CEList.toString(ces, len));
|
||||
System.out.println("\t" + ucd.getCodeAndName(chr)
|
||||
+ " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
|
||||
|
@ -2126,6 +2167,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
continue;
|
||||
}
|
||||
canIt.setSource(key);
|
||||
|
||||
boolean first = true;
|
||||
while (true) {
|
||||
String s = canIt.next();
|
||||
|
@ -2134,9 +2176,6 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
if (contentsForCanonicalIteration.contains(s)) continue;
|
||||
if (additionalSet.contains(s)) continue;
|
||||
|
||||
if (s.equals("\u01EC")) {
|
||||
System.out.println("01ec");
|
||||
}
|
||||
|
||||
// Skip anything that is not FCD.
|
||||
if (!NFD.isFCD(s)) continue;
|
||||
|
@ -2234,7 +2273,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
log.println("# - Differs from previous version in that MAX value was introduced at 1F.");
|
||||
log.println("# All tertiary values are shifted down by 1, filling the gap at 7!");
|
||||
|
||||
int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24;
|
||||
int firstImplicit = getImplicitPrimary(CJK_BASE) >>> 24;
|
||||
int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
|
||||
log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
|
||||
log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
|
||||
|
@ -2285,13 +2324,15 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
int sec = UCA.getSecondary(ces[q]);
|
||||
int ter = UCA.getTertiary(ces[q]);
|
||||
|
||||
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
|
||||
// special treatment for unsupported!
|
||||
|
||||
if (UCA.isImplicitLeadPrimary(pri)) {
|
||||
System.out.println("DEBUG: " + CEList.toString(ces, len)
|
||||
+ ", Current: " + q + ", " + ucd.getCodeAndName(chr));
|
||||
++q;
|
||||
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
|
||||
int pri2 = UCA.getPrimary(ces[q]);
|
||||
// get old code point
|
||||
|
@ -2301,7 +2342,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
// double check results!
|
||||
|
||||
int[] testImplicit = new int[2];
|
||||
UCA.CodepointToImplicit(cp, testImplicit);
|
||||
collator.CodepointToImplicit(cp, testImplicit);
|
||||
boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1];
|
||||
if (gotError) {
|
||||
System.out.println("ERROR");
|
||||
|
@ -2360,7 +2401,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
}
|
||||
if (nonePrinted) {
|
||||
log.print("[,,]");
|
||||
oldStr.append(UCA.ceToString(0));
|
||||
oldStr.append(CEList.toString(0));
|
||||
}
|
||||
longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0)));
|
||||
log.println();
|
||||
|
@ -2386,7 +2427,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
boolean lastOne = false;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i);
|
||||
boolean thisOne = ucd.isCJK_BASE(i) || ucd.isCJK_AB(i);
|
||||
if (thisOne != lastOne) {
|
||||
summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
|
||||
summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
|
||||
|
@ -2425,7 +2466,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
|
||||
+ Utility.hex(sampleEq[i]) + " ");
|
||||
for (int q = 0; q < len; ++q) {
|
||||
summary.print(UCA.ceToString(ces[q]));
|
||||
summary.print(CEList.toString(ces[q]));
|
||||
}
|
||||
summary.println(" " + ucd.getName(sampleEq[i]));
|
||||
}
|
||||
|
@ -2499,24 +2540,24 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
*/
|
||||
static int swapCJK(int i) {
|
||||
|
||||
if (i >= UCA.CJK_BASE) {
|
||||
if (i < UCA.CJK_LIMIT) return i - UCA.CJK_BASE;
|
||||
if (i >= CJK_BASE) {
|
||||
if (i < CJK_LIMIT) return i - CJK_BASE;
|
||||
|
||||
if (i < UCA.CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
|
||||
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < UCA.CJK_COMPAT_USED_LIMIT) return i - UCA.CJK_COMPAT_USED_BASE
|
||||
+ (UCA.CJK_LIMIT - UCA.CJK_BASE);
|
||||
if (i < UCA.CJK_B_BASE) return i + NON_CJK_OFFSET;
|
||||
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE);
|
||||
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < UCA.CJK_B_LIMIT) return i; // non-BMP-CJK
|
||||
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
|
||||
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
}
|
||||
if (i < UCA.CJK_A_BASE) return i + NON_CJK_OFFSET;
|
||||
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < UCA.CJK_A_LIMIT) return i - UCA.CJK_A_BASE
|
||||
+ (UCA.CJK_LIMIT - UCA.CJK_BASE)
|
||||
+ (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE);
|
||||
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE)
|
||||
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
}
|
||||
|
||||
|
@ -2642,14 +2683,14 @@ static int swapCJK(int i) {
|
|||
oldPrimary = newPrimary;
|
||||
}
|
||||
|
||||
showImplicit("# First CJK", UCA.CJK_BASE);
|
||||
showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
|
||||
showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE);
|
||||
showImplicit("# Last CJK-compat", UCA.CJK_COMPAT_USED_LIMIT-1);
|
||||
showImplicit("# First CJK_A", UCA.CJK_A_BASE);
|
||||
showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1);
|
||||
showImplicit("# First CJK_B", UCA.CJK_B_BASE);
|
||||
showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
|
||||
showImplicit("# First CJK", CJK_BASE);
|
||||
showImplicit("# Last CJK", CJK_LIMIT-1);
|
||||
showImplicit("# First CJK-compat", CJK_COMPAT_USED_BASE);
|
||||
showImplicit("# Last CJK-compat", CJK_COMPAT_USED_LIMIT-1);
|
||||
showImplicit("# First CJK_A", CJK_A_BASE);
|
||||
showImplicit("# Last CJK_A", CJK_A_LIMIT-1);
|
||||
showImplicit("# First CJK_B", CJK_B_BASE);
|
||||
showImplicit("# Last CJK_B", CJK_B_LIMIT-1);
|
||||
showImplicit("# First Other Implicit", 0);
|
||||
showImplicit("# Last Other Implicit", 0x10FFFF);
|
||||
|
||||
|
@ -2667,9 +2708,9 @@ static int swapCJK(int i) {
|
|||
|
||||
// separate the three groups
|
||||
|
||||
if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) {
|
||||
if (ucd.isCJK_BASE(i) || CJK_COMPAT_USED_BASE <= i && i < CJK_COMPAT_USED_LIMIT) {
|
||||
if (batch != 0) continue;
|
||||
} else if (UCA.isCJK_AB(i)) {
|
||||
} else if (ucd.isCJK_AB(i)) {
|
||||
if (batch != 1) continue;
|
||||
} else if (batch != 2) continue;
|
||||
|
||||
|
@ -2993,7 +3034,7 @@ static int swapCJK(int i) {
|
|||
|
||||
for (char ch = 0; ch < 0xFFFF; ++ch) {
|
||||
byte type = collator.getCEType(ch);
|
||||
if (type < UCA.FIXED_CE) {
|
||||
if (type < FIXED_CE) {
|
||||
int len = collator.getCEs(String.valueOf(ch), true, ces);
|
||||
int primary = UCA.getPrimary(ces[0]);
|
||||
if (primary < variableHigh) continue;
|
||||
|
@ -3088,36 +3129,22 @@ static int swapCJK(int i) {
|
|||
System.out.println("Sorting");
|
||||
|
||||
for (int i = 0; i <= 0xFFFF; ++i) {
|
||||
if (EXCLUDE_UNSUPPORTED && !collator.found.get(i)) continue;
|
||||
if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue;
|
||||
if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
|
||||
//if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
|
||||
addString(UTF32.valueOf32(i), option);
|
||||
}
|
||||
|
||||
Hashtable multiTable = collator.getContracting();
|
||||
Enumeration enum = multiTable.keys();
|
||||
while (enum.hasMoreElements()) {
|
||||
addString((String)enum.nextElement(), option);
|
||||
}
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
|
||||
cc.enableSamples();
|
||||
|
||||
for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters
|
||||
addString(extraConformanceTests[i], option);
|
||||
while (true) {
|
||||
String s = cc.next();
|
||||
if (s == null) break;
|
||||
addString(s, option);
|
||||
}
|
||||
|
||||
for (int i = 0; i < extraConformanceRanges.length; ++i) {
|
||||
int start = extraConformanceRanges[i][0];
|
||||
int end = extraConformanceRanges[i][1];
|
||||
int increment = ((end - start + 1) / 303) + 1;
|
||||
//System.out.println("Range: " + start + ", " + end + ", " + increment);
|
||||
addString(start, option);
|
||||
for (int j = start+1; j < end-1; j += increment) {
|
||||
addString(j, option);
|
||||
addString(j+1, option);
|
||||
}
|
||||
addString(end-1, option);
|
||||
addString(end, option);
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Total: " + sortedD.size());
|
||||
Iterator it;
|
||||
|
||||
|
|
|
@ -5,12 +5,14 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
WARNING: OLD FILE. DON'T COMPILE.
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -21,6 +23,7 @@ import com.ibm.text.UCD.*;
|
|||
import com.ibm.text.utility.*;
|
||||
|
||||
public class WriteHTMLCollation implements UCD_Types {
|
||||
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
|
@ -74,8 +77,8 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
*/
|
||||
|
||||
// DO FOLLOWING
|
||||
writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
|
||||
writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
|
||||
//writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
|
||||
//writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
|
||||
|
||||
// SKIP BELOW
|
||||
if (true) return;
|
||||
|
@ -178,7 +181,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/*
|
||||
static void writeConformance(String filename, byte option) throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter(filename);
|
||||
|
||||
|
@ -193,6 +196,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
addStringX(c, option);
|
||||
}
|
||||
|
||||
|
||||
Hashtable multiTable = collator.getContracting();
|
||||
Enumeration enum = multiTable.keys();
|
||||
while (enum.hasMoreElements()) {
|
||||
|
@ -248,7 +252,8 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
sortedD.clear();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
static void addStringX(int x, byte option) {
|
||||
addStringX(String.valueOf((char)x), option);
|
||||
}
|
||||
|
@ -382,7 +387,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
|
||||
if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
|
||||
int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
|
||||
String comp = collator.ceToString(kenComp, kenCLen);
|
||||
String comp = CEList.toString(kenComp, kenCLen);
|
||||
|
||||
if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
|
||||
forLater.put((char)(COMPRESSED | type) + s, comp);
|
||||
|
@ -422,10 +427,10 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
String comp = (String)forLater.get(key);
|
||||
|
||||
int kenLen = collator.getCEs(s, decompType, kenCes);
|
||||
String kenStr = collator.ceToString(kenCes, kenLen);
|
||||
String kenStr = CEList.toString(kenCes, kenLen);
|
||||
|
||||
int markLen = fixCompatibilityCE(s, true, markCes, false);
|
||||
String markStr = collator.ceToString(markCes, markLen);
|
||||
String markStr = CEList.toString(markCes, markLen);
|
||||
|
||||
if ((type & COMPRESSED) != 0) {
|
||||
log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
|
||||
|
@ -444,7 +449,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
log.println("NFD : " + ucd.getCodeAndName(nfdstr));
|
||||
}
|
||||
//kenCLen = collator.getCEs(decomp, true, kenComp);
|
||||
//log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));
|
||||
//log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
|
@ -569,7 +574,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
|
||||
{
|
||||
int len2 = collator.getCEs("\u2474", true, ces);
|
||||
System.out.println(UCA.ceToString(ces, len2));
|
||||
System.out.println(CEList.toString(ces, len2));
|
||||
|
||||
String a = collator.getSortKey("a");
|
||||
String b = collator.getSortKey("A");
|
||||
|
@ -640,7 +645,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) relation = " <<<";
|
||||
lastCE = ce;
|
||||
if (chr.equals("\u2474")) {
|
||||
System.out.println(UCA.ceToString(ces, len));
|
||||
System.out.println(CEList.toString(ces, len));
|
||||
}
|
||||
|
||||
// check expansions
|
||||
|
@ -653,7 +658,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
int probe = ces[i];
|
||||
String s = getFromBackMap(backMap, probe);
|
||||
if (s == null) {
|
||||
System.out.println("No back map for " + collator.ceToString(ces[i])
|
||||
System.out.println("No back map for " + CEList.toString(ces[i])
|
||||
+ ": " + ucd.getCodeAndName(chr));
|
||||
expansion += "[" + Utility.hex(ces[i]) + "]";
|
||||
} else {
|
||||
|
@ -943,7 +948,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
}
|
||||
if (sampleEq[sec] == null) sampleEq[sec] = chr;
|
||||
if (sampleEq[ter] == null) sampleEq[ter] = chr;
|
||||
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
int np = primaryDelta[UCA.getPrimary(ces[q])];
|
||||
hexBytes(np, newPrimary);
|
||||
hexBytes(fixSecondary(UCA.getSecondary(ces[q])), newSecondary);
|
||||
|
@ -968,7 +973,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
}
|
||||
if (nonePrinted) {
|
||||
log.print("[,,]");
|
||||
oldStr.append(UCA.ceToString(0));
|
||||
oldStr.append(CEList.toString(0));
|
||||
}
|
||||
log.println(" # " + oldStr + " # " + ucd.getName(chr.charAt(0)));
|
||||
lastChr = chr;
|
||||
|
@ -1017,7 +1022,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
|
||||
+ Utility.hex(sampleEq[i]) + " ");
|
||||
for (int q = 0; q < len; ++q) {
|
||||
summary.print(UCA.ceToString(ces[q]));
|
||||
summary.print(CEList.toString(ces[q]));
|
||||
}
|
||||
summary.println(" " + ucd.getName(sampleEq[i]));
|
||||
}
|
||||
|
@ -1438,7 +1443,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
|||
|
||||
for (int i = 0; i <= 0xFFFF; ++i) {
|
||||
char c = (char)i;
|
||||
if (EXCLUDE_UNSUPPORTED && !collator.found.get(c)) continue;
|
||||
if (EXCLUDE_UNSUPPORTED && !collator.found.contains(c)) continue;
|
||||
if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
|
||||
//if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
|
||||
addString(String.valueOf(c), option);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/06/15 02:47:14 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -63,6 +63,7 @@ public final class Main implements UCD_Types {
|
|||
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime();
|
||||
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
|
||||
|
||||
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/06/15 02:47:13 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -146,7 +146,7 @@ public final class UCD implements UCD_Types {
|
|||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getName(String s, byte style) {
|
||||
if (s.length() == 1) return get(s.charAt(0), true).name;
|
||||
if (s.length() == 1) return getName(s.charAt(0), style);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
|
@ -182,15 +182,15 @@ public final class UCD implements UCD_Types {
|
|||
/**
|
||||
* Get the name and number (U+xxxx NAME) for a code point
|
||||
*/
|
||||
public String getCodeAndName(int codePoint) {
|
||||
return getCode(codePoint) + " " + getName(codePoint);
|
||||
public String getCodeAndName(int codePoint, byte type) {
|
||||
return getCode(codePoint) + " " + getName(codePoint, type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for the code points in a string,
|
||||
* separated by ", "
|
||||
*/
|
||||
public String getCodeAndName(String s) {
|
||||
public String getCodeAndName(String s, byte type) {
|
||||
if (s == null || s.length() == 0) return "NULL";
|
||||
if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -203,6 +203,20 @@ public final class UCD implements UCD_Types {
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for a code point
|
||||
*/
|
||||
public String getCodeAndName(int codePoint) {
|
||||
return getCodeAndName(codePoint, NORMAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name and number (U+xxxx NAME) for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getCodeAndName(String s) {
|
||||
return getCodeAndName(s, NORMAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the general category
|
||||
*/
|
||||
|
@ -990,10 +1004,20 @@ to guarantee identifier closure.
|
|||
result = getRaw(codePoint);
|
||||
if (result == null) {
|
||||
result = UData.UNASSIGNED;
|
||||
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
result.name = null; // clean this up, since we reuse UNASSIGNED
|
||||
result.shortName = null;
|
||||
if (fixStrings) {
|
||||
result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
}
|
||||
}
|
||||
if (result.shortName != null && result.shortName.length() == 0) {
|
||||
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
if (fixStrings) {
|
||||
if (result.name == null) {
|
||||
result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
System.out.println("Warning: fixing name for " + result.name);
|
||||
}
|
||||
if (result.shortName == null) {
|
||||
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
case 0x3400: // CJK Ideograph Extension A
|
||||
|
@ -1024,6 +1048,8 @@ to guarantee identifier closure.
|
|||
result = getRaw(rangeStart);
|
||||
if (result == null) {
|
||||
result = UData.UNASSIGNED;
|
||||
result.name = null; // clean this up, since we reuse UNASSIGNED
|
||||
result.shortName = null;
|
||||
if (fixStrings) {
|
||||
result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
|
||||
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
|
@ -1047,6 +1073,32 @@ to guarantee identifier closure.
|
|||
return result;
|
||||
}
|
||||
|
||||
// Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
|
||||
|
||||
public static final boolean isCJK_AB(int bigChar) {
|
||||
return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT
|
||||
|| CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
|
||||
}
|
||||
|
||||
public static boolean isCJK_BASE(int cp) {
|
||||
return (CJK_BASE <= cp && cp < CJK_LIMIT
|
||||
|| cp == 0xFA0E // compat characters that don't decompose.
|
||||
|| cp == 0xFA0F
|
||||
|| cp == 0xFA11
|
||||
|| cp == 0xFA13
|
||||
|| cp == 0xFA14
|
||||
|| cp == 0xFA1F
|
||||
|| cp == 0xFA21
|
||||
|| cp == 0xFA23
|
||||
|| cp == 0xFA24
|
||||
|| cp == 0xFA27
|
||||
|| cp == 0xFA28
|
||||
|| cp == 0xFA29
|
||||
|| cp == 0xFA2E
|
||||
|| cp == 0xFA2F
|
||||
);
|
||||
}
|
||||
|
||||
// Hangul constants
|
||||
|
||||
public static final int
|
||||
|
@ -1108,7 +1160,7 @@ to guarantee identifier closure.
|
|||
return 0xFFFF; // no composition
|
||||
}
|
||||
|
||||
static boolean isHangulSyllable(int char1) {
|
||||
static public boolean isHangulSyllable(int char1) {
|
||||
return SBase <= char1 && char1 < SLimit;
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/06/15 02:47:13 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,8 +21,17 @@ public interface UCD_Types {
|
|||
public static final String UCD_DIR = BASE_DIR + "UCD\\";
|
||||
public static final String BIN_DIR = BASE_DIR + "BIN\\";
|
||||
public static final String GEN_DIR = BASE_DIR + "GEN\\";
|
||||
|
||||
|
||||
|
||||
public static final int
|
||||
CJK_BASE = 0x4E00,
|
||||
CJK_LIMIT = 0x9FFF+1,
|
||||
CJK_COMPAT_USED_BASE = 0xFA0E,
|
||||
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
|
||||
CJK_A_BASE = 0x3400,
|
||||
CJK_A_LIMIT = 0x4DBF+1,
|
||||
CJK_B_BASE = 0x20000,
|
||||
CJK_B_LIMIT = 0x2A6DF+1;
|
||||
|
||||
static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes
|
||||
|
||||
// Unicode Property Types
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -18,7 +18,7 @@ import com.ibm.text.utility.*;
|
|||
|
||||
class UData implements UCD_Types {
|
||||
String name;
|
||||
String shortName = ""; // cache
|
||||
String shortName; // cache
|
||||
String decompositionMapping;
|
||||
String simpleUppercase;
|
||||
String simpleLowercase;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -27,6 +27,27 @@ import com.ibm.text.utility.*;
|
|||
import java.text.NumberFormat;
|
||||
|
||||
public class VerifyUCD implements UCD_Types {
|
||||
|
||||
static void oneTime() {
|
||||
Default.setUCD();
|
||||
int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000
|
||||
for (int i = 0; i < testSet.length; ++i) {
|
||||
int item = testSet[i];
|
||||
System.out.println(Default.ucd.getCode(item));
|
||||
|
||||
boolean ass = Default.ucd.isAssigned(item);
|
||||
System.out.println(ass ? " assigned" : " unassigned");
|
||||
ass = Default.ucd.isAllocated(item);
|
||||
System.out.println(ass ? " allocated" : " unallocated");
|
||||
|
||||
String name = Default.ucd.getName(item, SHORT);
|
||||
System.out.println(" " + name);
|
||||
name = Default.ucd.getName(item);
|
||||
System.out.println(" " + name);
|
||||
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
static final byte NC = UNUSED_CATEGORY;
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $
|
||||
* $Date: 2001/09/19 23:33:52 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/15 02:47:14 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -17,30 +17,65 @@ package com.ibm.text.utility;
|
|||
// Simple stack mechanism, with push, pop and access
|
||||
// =============================================================
|
||||
|
||||
public final class IntStack implements Comparable {
|
||||
public final class IntStack implements Comparable, Cloneable {
|
||||
private int[] values;
|
||||
private int top = 0;
|
||||
private int first = 0;
|
||||
|
||||
public IntStack(int initialSize) {
|
||||
values = new int[initialSize];
|
||||
}
|
||||
|
||||
public IntStack append(IntStack other) {
|
||||
// TODO speed up by copying arrays
|
||||
for (int i = 0; i < other.getTop(); ++i) {
|
||||
push(other.get(i));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public void push(int value) {
|
||||
public IntStack append(int value) {
|
||||
return push(value);
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return top - first;
|
||||
}
|
||||
|
||||
public IntStack push(int value) {
|
||||
if (top >= values.length) { // must grow?
|
||||
int[] temp = new int[values.length*2];
|
||||
System.arraycopy(values,0,temp,0,values.length);
|
||||
values = temp;
|
||||
}
|
||||
values[top++] = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int pop() {
|
||||
if (top > 0) return values[--top];
|
||||
if (top > first) {
|
||||
int result = values[--top];
|
||||
if (top == first && first > 0) {
|
||||
top = first = 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
throw new IllegalArgumentException("Stack underflow");
|
||||
}
|
||||
|
||||
public int popFront() {
|
||||
if (top > first) {
|
||||
int result = values[first++];
|
||||
if (top == first) {
|
||||
top = first = 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
throw new IllegalArgumentException("Stack underflow");
|
||||
}
|
||||
|
||||
public int get(int index) {
|
||||
if (0 <= index && index < top) return values[index];
|
||||
if (first <= index && index < top) return values[index];
|
||||
throw new IllegalArgumentException("Stack index out of bounds");
|
||||
}
|
||||
|
||||
|
@ -49,22 +84,24 @@ public final class IntStack implements Comparable {
|
|||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return top == 0;
|
||||
return top - first == 0;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
top = 0;
|
||||
top = first = 0;
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
IntStack that = (IntStack) other;
|
||||
int min = top;
|
||||
if (min < that.top) min = that.top;
|
||||
for (int i = 0; i < min; ++i) {
|
||||
int result = values[i] - that.values[i];
|
||||
int myLen = top - first;
|
||||
int thatLen = that.top - that.first;
|
||||
int limit = first + ((myLen < thatLen) ? myLen : thatLen);
|
||||
int delta = that.first - first;
|
||||
for (int i = first; i < limit; ++i) {
|
||||
int result = values[i] - that.values[i + delta];
|
||||
if (result != 0) return result;
|
||||
}
|
||||
return top - that.top;
|
||||
return myLen - thatLen;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
|
@ -73,9 +110,19 @@ public final class IntStack implements Comparable {
|
|||
|
||||
public int hashCode() {
|
||||
int result = top;
|
||||
for (int i = 0; i < top; ++i) {
|
||||
for (int i = first; i < top; ++i) {
|
||||
result = result * 37 + values[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
try {
|
||||
IntStack result = (IntStack) (super.clone());
|
||||
result.values = (int[]) result.values.clone();
|
||||
return result;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new IllegalArgumentException("Will never happen");
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue