From 3940ed8c005493e92e020681b2d811443bac3a53 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Sat, 15 Jun 2002 02:47:14 +0000 Subject: [PATCH] finally did some significant code cleanup on collation. not enough, but it's a start X-SVN-Rev: 8896 --- .../unicodetools/com/ibm/text/UCA/CEList.java | 13 +- tools/unicodetools/com/ibm/text/UCA/Main.java | 7 +- tools/unicodetools/com/ibm/text/UCA/UCA.java | 681 ++++++------------ .../com/ibm/text/UCA/WriteCollationData.java | 209 +++--- .../com/ibm/text/UCA/WriteHTMLCollation.java | 39 +- tools/unicodetools/com/ibm/text/UCD/Main.java | 5 +- tools/unicodetools/com/ibm/text/UCD/UCD.java | 72 +- .../com/ibm/text/UCD/UCD_Types.java | 17 +- .../unicodetools/com/ibm/text/UCD/UData.java | 6 +- .../com/ibm/text/UCD/VerifyUCD.java | 25 +- .../com/ibm/text/utility/IntStack.java | 75 +- 11 files changed, 522 insertions(+), 627 deletions(-) diff --git a/tools/unicodetools/com/ibm/text/UCA/CEList.java b/tools/unicodetools/com/ibm/text/UCA/CEList.java index 0e6ae21874b..81def32e15b 100644 --- a/tools/unicodetools/com/ibm/text/UCA/CEList.java +++ b/tools/unicodetools/com/ibm/text/UCA/CEList.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $ -* $Date: 2002/05/31 01:41:03 $ -* $Revision: 1.4 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -165,6 +165,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types { return result.toString(); } + public static String toString(IntStack ces) { + StringBuffer result = new StringBuffer(); + for (int i = 0; i < ces.length(); ++i) { + if (i != 0) result.append(' '); + result.append(toString(ces.get(i))); + } + return result.toString(); + } + public static String toString(int ce) { return "[" + Utility.hex(UCA.getPrimary(ce)) + "." + Utility.hex(UCA.getSecondary(ce)) + "." 
diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 03bbb3a8649..203bebd66bf 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2002/06/04 01:59:01 $ -* $Revision: 1.5 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -18,7 +18,8 @@ import com.ibm.text.utility.*; public class Main { static final String UCDVersion = ""; - static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", "writeconformance", "writeconformanceshifted", + static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", + "writeconformance", "writeconformanceshifted", "WriteRules", "WriteRulesWithNames", "WriteRulesXML"}; public static void main(String args[]) throws Exception { diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index 8f3e1cb7953..ffd674516cf 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2002/06/04 01:58:56 $ -* $Revision: 1.13 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.14 $ * ******************************************************************************* */ @@ -24,6 +24,7 @@ import com.ibm.text.UCD.Normalizer; import com.ibm.text.UCD.UCD; import com.ibm.text.utility.*; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; //import com.ibm.text.CollationData.*; @@ -62,7 +63,7 @@ This is because of shared characters between scripts with different 
directions, like French with Arabic or Greek. */ -final public class UCA implements Comparator { +final public class UCA implements Comparator, UCA_Types { public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; @@ -85,19 +86,13 @@ final public class UCA implements Comparator { // base directory will change depending on the installation public static final String BASE_DIR = "c:\\DATA\\"; - /** Enum for alternate handling */ - public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3; - - /** - * Used to terminate a list of CEs - */ - public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string - // ============================================================= // Test Settings // ============================================================= static final boolean DEBUG = false; + static final boolean DEBUG_SHOW_LINE = false; + static final boolean SHOW_STATS = true; static final boolean SHOW_CE = false; @@ -109,6 +104,7 @@ final public class UCA implements Comparator { static final boolean RECORDING_CHARS = true; private UCD ucd; + private UCA_Data ucaData; // ============================================================= // Main Methods @@ -121,11 +117,7 @@ final public class UCA implements Comparator { */ public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException { fullData = source == null; - - // clear some tables - for (int i = 0; i < collationElements.length; ++i) { - collationElements[i] = UNSUPPORTED; - } + // load the normalizer if (toD == null) { toD = new Normalizer(Normalizer.NFD, unicodeVersion); @@ -134,6 +126,8 @@ final public class UCA implements Comparator { ucd = UCD.make(unicodeVersion); ucdVersion = ucd.getVersion(); + ucaData = new UCA_Data(toD, ucd); + // either get the full sources, or just a demo set if (fullData) { for (int i = 0; i < KEYS.length; ++i) { @@ -234,7 +228,7 @@ final public class UCA implements 
Comparator { } if (SHOW_CE) { if (debugList.length() != 0) debugList.append("/"); - debugList.append(ceToString(ce)); + debugList.append(CEList.toString(ce)); } // add weights @@ -412,6 +406,35 @@ final public class UCA implements Comparator { return target; } + /** + * Returns a list of CEs for a unicode character at a position. + * @param sourceString string to make a sort key for. + * @param offset position in string + * @param decomposition true for UCA, false where the text is guaranteed to be + * normalization form C with no combining marks of class 0. + * @param output array for output. Must be large enough on entry. When done, is terminated with TERMINATOR. + */ + public void getCEs(String sourceString, boolean decomposition, IntStack output) { + decompositionBuffer.setLength(0); + if (decomposition) { + toD.normalize(sourceString, decompositionBuffer); + } else { + decompositionBuffer.append(sourceString); + } + rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai) + index = 0; + + // process CEs, building weight strings + while (true) { + //fixQuaternatiesPosition = quaternaries.length(); + int ce = getCE(); + if (ce == 0) continue; + if (ce == TERMINATOR) break; + output.push(ce); + } + } + + /** * Returns a list of CEs for a unicode character at a position. * @param sourceString string to make a sort key for. @@ -477,14 +500,6 @@ final public class UCA implements Comparator { return strength == 1 ? primarySet : strength == 2 ? 
secondarySet : tertiarySet; } - /** - * CE Type - */ - static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2, - CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7, - FIXED_CE = 3; - // SURROGATE_CE = 6, - /** * Returns the char associated with a FIXED value */ @@ -497,28 +512,7 @@ final public class UCA implements Comparator { * Return the type of the CE */ public byte getCEType(int ch) { - - if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands - - int ce = collationElements[ch]; - if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return NORMAL_CE; - if (ce == UNSUPPORTED) { - - // Special check for Han, Hangul - if (isHangul(ch)) return HANGUL_CE; - - if (isCJK(ch)) return CJK_CE; - if (isCJK_AB(ch)) return CJK_AB_CE; - - // special check for unsupported surrogate pair, 20 1/8 bits - //if (0xD800 <= ch && ch <= 0xDFFF) { - // return SURROGATE_CE; - //} - return UNSUPPORTED_CE; - } - - if (ce == CONTRACTING) return CONTRACTING_CE; - return EXPANDING_CE; + return ucaData.getCEType(ch); } /** @@ -604,19 +598,11 @@ final public class UCA implements Comparator { return result.toString(); } - /** - * Produces a human-readable string for a collation element - */ - static public String ceToString(int ce) { - return "[" + Utility.hex(getPrimary(ce)) + "." - + Utility.hex(getSecondary(ce)) + "." - + Utility.hex(getTertiary(ce)) + "]"; - } - /** * Produces a human-readable string for a collation element. * value is terminated by -1! */ + /* static public String ceToString(int[] ces, int len) { StringBuffer result = new StringBuffer(); for (int i = 0; i < len; ++i) { @@ -624,11 +610,13 @@ final public class UCA implements Comparator { } return result.toString(); } + &/ /** * Produces a human-readable string for a collation element. * value is terminated by -1! 
*/ + /* static public String ceToString(int[] ces) { StringBuffer result = new StringBuffer(); for (int i = 0; ; ++i) { @@ -637,7 +625,7 @@ final public class UCA implements Comparator { } return result.toString(); } - + */ static boolean isImplicitLeadCE(int ce) { return isImplicitLeadPrimary(getPrimary(ce)); @@ -670,10 +658,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] * and to get the second part use (x & 0xFFFF) */ - static void CodepointToImplicit(int cp, int[] output) { + void CodepointToImplicit(int cp, int[] output) { int base = UNSUPPORTED_OTHER_BASE; - if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE; - else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE; + if (ucd.isCJK_BASE(cp)) base = UNSUPPORTED_CJK_BASE; + else if (ucd.isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE; output[0] = base + (cp >>> 15); output[1] = (cp & 0x7FFF) | 0x8000; } @@ -768,6 +756,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // Privates // ============================================================= + + IntStack expandingStack = new IntStack(10); + /** * Array used to reorder surrogates to top of 16-bit range, and others down. * Adds 2000 to D800..DFFF, making them F800..FFFF @@ -847,77 +838,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // Collation Element Memory Data Table Formats // ============================================================= - /** - * Used to composed Hangul and Han characters - */ - - static final int NEUTRAL_SECONDARY = 0x20; - static final int NEUTRAL_TERTIARY = 0x02; - /** * Temporary buffer used in getSortKey for the decomposed string */ private StringBuffer decompositionBuffer = new StringBuffer(); - /** - * The collation element data is stored a couple of different structures. - * First is collationElements, which generally contains the 32-bit CE corresponding - * to the data. It is directly indexed by character code.
- * For brevity in the implementation, we just use a flat array. - * A real implementation would use a multi-stage table, as described in TUS Section 5. - * table of simple collation elements, indexed by char.
- * Exceptional cases: expanding, contracting, unsupported are handled as described below. - */ - private int[] collationElements = new int[65536]; - - /** - * A special bit combination in a CE is used to reserve exception cases. This has the effect - * of removing a small number of the primary key values out of the 65536 possible. - */ - private static final int EXCEPTION_CE_MASK = 0xF8000000; - - - /** - * Any unsupported characters (those not in the UCA data tables) - * are marked with a exception bit combination - * so that they can be treated specially.
- * There are at least 34 values, so that we can use a range for surrogates - * However, we do add to the first weight if we have surrogate pairs! - */ - private static final int UNSUPPORTED_CJK_BASE = 0xFF40; - private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80; - private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0; - - private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE; - private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40; - - private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); - // was 0xFFC20101; - /** - * Contracting characters are marked with a exception bit combination - * in the collationElement table. - * This means that they are the first character of a contraction, and need - * to be looked up (with following characters) in the contractingTable.
- * This isn't a MASK since there is exactly one value. - */ - private static final int CONTRACTING = 0xFA310000; - - /** - * Expanding characters are marked with a exception bit combination - * in the collationElement table. - * This means that they map to more than one CE, which is looked up in - * the expansionTable by index. See EXCEPTION_INDEX_MASK - */ - private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start - - /** - * This mask is used to get the index from an EXPANDING exception. - * The contracting characters can also make use of this in a future optimization. - */ - static final int EXCEPTION_INDEX_MASK = 0x0000FFFF; - /** * We take advantage of the variables being in a closed range to save a bit per CE. * The low and high values are initially set to be at the opposite ends of the range, @@ -931,27 +858,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] private int variableLowCE; // used for testing against private int variableHighCE; // used for testing against - /** - * Although a single character can expand into multiple CEs, we don't want to burden - * the normal case with the storage. So, they get a special value in the collationElements - * array. This value has a distinct primary weight, followed by an index into a separate - * table called expandingTable. All of the CEs in that table, up to a TERMINATOR value - * will be used for the expansion. The implementation is as a stack; this just makes it - * easy to generate. - */ - private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys - - /** - * For now, this is just a simple mapping of strings to collation elements. - * The implementation depends on the contracting characters being "completed", - * so that it can be efficiently determined when to stop looking. 
- */ - private Hashtable contractingTable = new Hashtable(); + /* - /** - * Special char value that means failed or terminated - */ - private static final char NOT_A_CHAR = '\uFFFF'; + private void fixSurrogateContraction(char ch) { + //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0])); + if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return; + String chs = String.valueOf(ch); + Object probe = contractingTable.get(chs); + if (probe != null) return; + contractingTable.put(chs, new Integer(UNSUPPORTED)); + } + + */ /** * Marks whether we are using the full data set, or an abbreviated version for @@ -965,11 +883,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // Made part of the object to avoid reallocating each time. // ============================================================= - /** - * Stack for expanding characters - */ - private IntStack expandingStack = new IntStack(100); - /** * Temporary buffers used in getSortKey to store weights * these are NOT strings of Unicode characters--they are @@ -990,8 +903,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] * Temporary with requested decomposition */ boolean storedDecomposition; - int hangulHackBottom; - int hangulHackTop; /** * Used for supporting Thai rearrangement @@ -1015,7 +926,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] * (normalized) character code. */ private int getCE() { - if (!expandingStack.isEmpty()) return expandingStack.pop(); + if (!expandingStack.isEmpty()) return expandingStack.popFront(); char ch; // Fetch next character. Handle rearrangement for Thai, etc. @@ -1037,190 +948,56 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] } } - int ce = collationElements[ch]; - - // Hangul tailoring hack - //if (!storedDecomposition && hangulHackBottom <= ce && ce < hangulHackTop) return fixJamo(ch, ce); // hard coded fix!! - - // if the CE is not exceptional (unsupported, contracting, expanding) we are done. 
- if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce; - - if (ce == UNSUPPORTED) { - int bigChar = ch; + index = ucaData.get(ch, decompositionBuffer, index, expandingStack); + int ce = expandingStack.popFront(); // pop first (guaranteed to exist!) + if (ce == UNSUPPORTED_FLAG) { + return handleUnsupported(ch); + } + return ce; + } + + private int handleUnsupported(char ch) { + int bigChar = ch; - // Special check for Hangul - if (isHangul(bigChar)) { - // MUST DECOMPOSE!! - hangulBuffer = new StringBuffer(); - decomposeHangul(bigChar, hangulBuffer); - return getCE(); - // RECURSIVE!!! - } + // Special check for Hangul + if (ucd.isHangulSyllable(bigChar)) { + // MUST DECOMPOSE!! + hangulBuffer = new StringBuffer(); + decomposeHangul(bigChar, hangulBuffer); + return getCE(); + // RECURSIVE!!! + } + + // special check and fix for unsupported surrogate pair, 20 1/8 bits + if (0xD800 <= bigChar && bigChar <= 0xDFFF) { + // ignore unmatched surrogates (e.g. return zero) + if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched + int ch2 = decompositionBuffer.charAt(index); + if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched + index++; // skip next char + bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value + } + - if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!! - return 0; - } + if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!! + return 0; + } - // special check and fix for unsupported surrogate pair, 20 1/8 bits - if (0xD800 <= bigChar && bigChar <= 0xDFFF) { - // ignore unmatched surrogates (e.g. 
return zero) - if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched - int ch2 = decompositionBuffer.charAt(index); - if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched - index++; // skip next char - bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value - } - - // find the implicit values; returned in 0 and 1 - int[] implicit = new int[2]; - CodepointToImplicit(bigChar, implicit); + // find the implicit values; returned in 0 and 1 + int[] implicit = new int[2]; + CodepointToImplicit(bigChar, implicit); - // Now compose the two keys - // first push BBBB, which is #1 + // Now compose the two keys + + // push BBBB - expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY)); + expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY)); + + // return AAAA - // now return AAAA, which is #0 - - return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); + return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); + - } - if (ce == CONTRACTING) { - // Contracting is probably the most interesting (read "tricky") part - // of the algorithm. - // First get longest substring that is in the contracting table. - // For simplicity, we use a hash table for contracting. - // There are much better optimizations, - // but they take a more complicated build algorithm than we want to show here. - // NOTE: We are guaranteed that the character itself is in the contracting table because - // of the build process. - String probe = String.valueOf(ch); - Object value = contractingTable.get(probe); - if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch)); - - // We loop, trying to add successive characters to the longest substring. - while (index < decompositionBuffer.length()) { - char ch2 = decompositionBuffer.charAt(index); - - // see whether the current string plus the next char are in - // the contracting table. 
- String newProbe = probe + ch2; - Object newValue = contractingTable.get(newProbe); - if (newValue == null) break; // stop if not in table. - - // We succeeded--so update our new values, and set index - // and quaternary to indicate that we swallowed another character. - probe = newProbe; - value = newValue; - index++; - } - - // Now, see if we can add any combining marks - short lastCan = 0; - for (int i = index; i < decompositionBuffer.length(); ++i) { - // We only take certain characters. They have to be accents, - // and they have to not be blocked. - // Unlike above, if we don't find a match (and it was an accent!) - // then we don't stop, we continue looping. - char ch2 = decompositionBuffer.charAt(i); - short can = toD.getCanonicalClass(ch2); - if (can == 0) break; // stop with any zero (non-accent) - if (can == lastCan) continue; // blocked if same class as last - lastCan = can; // remember for next time - - // Now see if we can successfully add it onto our string - // and find it in the contracting table. - String newProbe = probe + ch2; - Object newValue = contractingTable.get(newProbe); - if (newValue == null) continue; - - // We succeeded--so update our new values, remove the char, and update - // quaternary to indicate that we swallowed another character. - probe = newProbe; - value = newValue; - decompositionBuffer.setCharAt(i,'\u0000'); // zero char - } - - // we are all done, and can extract the CE from the last value set. - ce = ((Integer)value).intValue(); - // if the CE is not exceptional (unsupported expanding) we are done. - // BTW we will never have a contracting CE at this point. 
- if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce; - // otherwise fall through to expansion - } - // expanding, so copy list of items onto stack - int index = ce & EXCEPTION_INDEX_MASK; // get index - // copy onto stack from index until reach TERMINATOR - while (true) { - ce = expandingTable.get(index++); - if (ce == TERMINATOR) break; - expandingStack.push(ce); - } - return expandingStack.pop(); // pop last (guaranteed to exist!) - } - - // Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6] - - public static boolean isCJK(int cp) { - return (CJK_BASE <= cp && cp < CJK_LIMIT - || cp == 0xFA0E // compat characters that don't decompose. - || cp == 0xFA0F - || cp == 0xFA11 - || cp == 0xFA13 - || cp == 0xFA14 - || cp == 0xFA1F - || cp == 0xFA21 - || cp == 0xFA23 - || cp == 0xFA24 - || cp == 0xFA27 - || cp == 0xFA28 - || cp == 0xFA29 - || cp == 0xFA2E - || cp == 0xFA2F - ); - } - - public static final int - CJK_BASE = 0x4E00, - CJK_LIMIT = 0x9FFF+1, - CJK_COMPAT_USED_BASE = 0xFA0E, - CJK_COMPAT_USED_LIMIT = 0xFA2F+1, - CJK_A_BASE = 0x3400, - CJK_A_LIMIT = 0x4DBF+1, - CJK_B_BASE = 0x20000, - CJK_B_LIMIT = 0x2A6DF+1; - - public static final boolean isCJK_AB(int bigChar) { - return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT - || CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT); - } -/* -2E80..2EFF; CJK Radicals Supplement -2F00..2FDF; Kangxi Radicals - -3400..4DBF; CJK Unified Ideographs Extension A -4E00..9FFF; CJK Unified Ideographs -F900..FAFF; CJK Compatibility Ideographs - -20000..2A6DF; CJK Unified Ideographs Extension B -2F800..2FA1F; CJK Compatibility Ideographs Supplement - -Compat: -# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D -# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10 -# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12 -# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E -# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20 -# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22 
-# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 -# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D -# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A -# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D - -*/ - - private final boolean isHangul(int bigChar) { - return (0xAC00 <= bigChar && bigChar <= 0xD7A3); } /** @@ -1287,12 +1064,12 @@ Compat: */ private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0; private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1; - Map multiTable = new TreeMap(); - BitSet found = new BitSet(); + UnicodeSet found = new UnicodeSet(); - public Hashtable getContracting() { + /*public Hashtable getContracting() { return new Hashtable(multiTable); } + */ public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) { return new UCAContents(ceLimit, skipDecomps, ucdVersion); @@ -1317,6 +1094,16 @@ Compat: this.ceLimit = ceLimit; this.nfd = new Normalizer(Normalizer.NFD, unicodeVersion); this.skipDecomps = skipDecomps; + + // FIX SAMPLES + if (SAMPLE_RANGES[0][0] == 0) { + for (int i = 0; ; ++i) { // add first unallocated character + if (!ucd.isAssigned(i)) { + SAMPLE_RANGES[0][0] = i; + break; + } + } + } } /** @@ -1334,7 +1121,9 @@ Compat: // normal case while (current++ < 0x10FFFF) { - + if (current == 0x406) { + System.out.println("DEBUG"); + } //char ch = (char)current; byte type = getCEType(current); if (type >= ceLimit || type == CONTRACTING_CE) continue; @@ -1349,15 +1138,18 @@ Compat: } // contractions - if (enum == null) enum = multiTable.keySet().iterator(); - if (enum.hasNext()) { + if (enum == null) enum = ucaData.getContractions(); + while (enum.hasNext()) { result = (String)enum.next(); + if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) { + //System.out.println("Skipping " + ucd.getCodeAndName(result)); + continue; // try again + } return result; } // 
extra samples if (currentRange < SAMPLE_RANGES.length) { - System.out.println("*"); try { result = UTF16.valueOf(itemInRange); } catch (RuntimeException e) { @@ -1372,10 +1164,11 @@ Compat: endOfRange = SAMPLE_RANGES[currentRange].length > 1 ? SAMPLE_RANGES[currentRange][1] : startOfRange; - skip = ((endOfRange - startOfRange) / 513); + //skip = ((endOfRange - startOfRange) / 3); } - } else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) { - itemInRange += skip; + } else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) { + //itemInRange += skip; + itemInRange = endOfRange - 5; } } @@ -1410,14 +1203,16 @@ Compat: } static final int[][] SAMPLE_RANGES = { - {0x10000}, - {0x10FFFF}, - {0x0220}, + {0}, // LEAVE EMPTY--Turns into first unassigned character {0xFFF0}, {0xD800}, {0xDFFF}, {0xFFFE}, {0xFFFF}, + {0x10000}, + {0xC0000}, + {0xD0000}, + {0x10FFFF}, {0x10FFFE}, {0x10FFFF}, {0x3400, 0x4DB5}, @@ -1426,7 +1221,7 @@ Compat: {0xA000, 0xA48C}, {0xE000, 0xF8FF}, {0x20000, 0x2A6D6}, - {0xE0000, 0xE00FF}, + {0xE0000, 0xE007E}, {0xF0000, 0xF00FD}, {0xFFF00, 0xFFFFD}, {0x100000, 0x1000FD}, @@ -1438,7 +1233,7 @@ Compat: * Values will override any previous mappings. 
*/ private void addCollationElements(BufferedReader in) throws java.io.IOException { - IntStack tempStack = new IntStack(100); // used for reversal + IntStack tempStack = new IntStack(100); StringBuffer multiChars = new StringBuffer(); // used for contracting chars String inputLine = ""; boolean[] wasImplicitLeadPrimary = new boolean[1]; @@ -1448,6 +1243,10 @@ Compat: if (inputLine == null) break; // means file is done String line = cleanLine(inputLine); // remove comments, extra whitespace if (line.length() == 0) continue; // skip empty lines + + if (DEBUG_SHOW_LINE) { + System.out.println("Processing: " + inputLine); + } position[0] = 0; // start at front of line if (line.startsWith("@version")) { @@ -1464,29 +1263,21 @@ Compat: } // collect characters - char value = getChar(line, position); - fixSurrogateContraction(value); - char value2 = getChar(line, position); multiChars.setLength(0); // clear buffer - if (value2 != NOT_A_CHAR) { - fixSurrogateContraction(value2); - multiChars.append(value); // append until we get terminator + + char value = getChar(line, position); + multiChars.append(value); + + //fixSurrogateContraction(value); + char value2 = getChar(line, position); + // append until we get terminator + while (value2 != NOT_A_CHAR) { multiChars.append(value2); - while (true) { - value2 = getChar(line, position); - if (value2 == NOT_A_CHAR) break; - fixSurrogateContraction(value2); - multiChars.append(value2); - } + value2 = getChar(line, position); } + if (RECORDING_CHARS) { - if (multiChars.length() > 1) { - multiTable.put(multiChars.toString(), ""); - } - found.set(value); - for (int i = 1; i < multiChars.length(); ++i) { - found.set(multiChars.charAt(i)); - } + found.addAll(multiChars.toString()); } if (!fullData && RECORDING_DATA) { if (value == 0 || value == '\t' || value == '\n' || value == '\r' @@ -1522,141 +1313,69 @@ Compat: } } } - if (ce2 != TERMINATOR) { // have expanding character! 
- // put list into the expanding table - // use a temporary stack to get them in reverse order - tempStack.push(ce); - tempStack.push(ce2); - // set collationElement to exception value, plus index - ce = EXPANDING_MASK | expandingTable.getTop(); - while (true) { - ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary); - if (ce2 == TERMINATOR) break; - tempStack.push(ce2); - } - // push onto expanding table, now in reverse order - while (!tempStack.isEmpty()) expandingTable.push(tempStack.pop()); - expandingTable.push(TERMINATOR); - } - //if (value == 0xd801) System.out.print("DEBUG: " + line); - - // assign CE(s) to char(s) - if (multiChars.length() > 0) { - contractingTable.put(multiChars.toString(), new Integer(ce)); - if (collationElements[value] == UNSUPPORTED) { - collationElements[value] = CONTRACTING; // mark special - } else if (collationElements[value] != CONTRACTING) { - // move old value to contracting table! - contractingTable.put(String.valueOf(value), new Integer(collationElements[value])); - collationElements[value] = CONTRACTING; // signal we must look up in table - } - } else if (collationElements[value] == CONTRACTING) { - // must add old value to contracting table! 
- contractingTable.put(String.valueOf(value), new Integer(ce)); - } else { - collationElements[value] = ce; // normal - } - //} catch (Exception e) { - // throw new IllegalArgumentException("Malformed line: " + inputLine + "\n " - // + e.getClass().getName() + ": " + e.getMessage()); + tempStack.clear(); + tempStack.push(ce); + + while (ce2 != TERMINATOR) { + tempStack.push(ce2); + ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary); + if (ce2 == TERMINATOR) break; + } + + ucaData.add(multiChars, tempStack); + } catch (RuntimeException e) { System.out.println("Error on line: " + inputLine); throw e; } } - private void fixSurrogateContraction(char ch) { - //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0])); - if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return; - String chs = String.valueOf(ch); - Object probe = contractingTable.get(chs); - if (probe != null) return; - contractingTable.put(chs, new Integer(0)); - } - + /* private void concat(int[] ces1, int[] ces2) { } - - private void add(String source, int[] ces, int ceLen) { - - int ce; - if (ceLen < 1) { - throw new IllegalArgumentException("CE too short: " + ceLen); - } else if (ceLen == 1) { - ce = ces[0]; - } else { - ce = EXPANDING_MASK | expandingTable.getTop(); - for (int i = 0; i < ceLen; ++i) { - expandingTable.push(ces[i]); - } - } - - // assign CE(s) to char(s) - int value = source.charAt(0); - //if (value == 0x10000) System.out.print("DEBUG2: " + source); - - if (source.length() > 0) { - contractingTable.put(source.toString(), new Integer(ce)); - if (collationElements[value] == UNSUPPORTED) { - collationElements[value] = CONTRACTING; // mark special - } else if (collationElements[value] != CONTRACTING) { - // move old value to contracting table! 
- contractingTable.put(String.valueOf(value), new Integer(collationElements[value])); - collationElements[value] = CONTRACTING; // signal we must look up in table - } - } else if (collationElements[value] == CONTRACTING) { - // must add old value to contracting table! - contractingTable.put(source, new Integer(ce)); - } else { - collationElements[source.charAt(0)] = ce; // normal - } - } + */ /** * Checks the internal tables corresponding to the UCA data. */ private void cleanup() { - // at this point, we have to guarantee that the contractingTable is CLOSED - // e.g. if a substring of length n is in the table, then the first n-1 characters - // are also!! + ucaData.checkConsistency(); + + Map missingStrings = new HashMap(); + Map tempMap = new HashMap(); - -/* -0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA -0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA - int[] temp1 = int[20]; - int[] temp2 = int[20]; - int[] temp3 = int[20]; - getCEs("\u0fb2", true, temp1); - getCEs("\u0fb3", true, temp2); - getCEs("\u0f71", true, temp3); - add("\u0FB2\u0F71", concat(temp1, temp3)); -*/ - - Hashtable missingStrings = new Hashtable(); - - int[] temp1 = new int[20]; - Enumeration enum = contractingTable.keys(); - while (enum.hasMoreElements()) { - String sequence = (String)enum.nextElement(); + Iterator enum = ucaData.getContractions(); + while (enum.hasNext()) { + String sequence = (String)enum.next(); //System.out.println("Contraction: " + Utility.hex(sequence)); for (int i = sequence.length()-1; i > 0; --i) { String shorter = sequence.substring(0,i); - Object probe = contractingTable.get(shorter); - if (probe == null) { - int len = getCEs(shorter, true, temp1); - if (false) System.out.println("WARNING: CLOSING: " + UCD.make().getCodeAndName(shorter) + " => " + ceToString(temp1, len)); - add(shorter, temp1, len); + if 
(!ucaData.contractionTableContains(shorter)) { + IntStack tempStack = new IntStack(1); + getCEs(shorter, true, tempStack); + if (false) System.out.println("WARNING: CLOSING: " + ucd.getCodeAndName(shorter) + + " => " + CEList.toString(tempStack)); + tempMap.put(shorter, tempStack); // missingStrings.put(shorter,""); // collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values } } } - enum = missingStrings.keys(); + // now add them. We couldn't before because we were iterating over it. + + enum = tempMap.keySet().iterator(); + while (enum.hasNext()) { + String shorter = (String) enum.next(); + IntStack tempStack = (IntStack) tempMap.get(shorter); + ucaData.add(shorter, tempStack); + } + + + enum = missingStrings.keySet().iterator(); if (missingStrings.size() != 0) { /** while (enum.hasMoreElements()) { @@ -1666,26 +1385,30 @@ Compat: } */ String errorMessage = ""; - while (enum.hasMoreElements()) { - String missing = (String)enum.nextElement(); + while (enum.hasNext()) { + String missing = (String)enum.next(); if (errorMessage.length() != 0) errorMessage += ", "; errorMessage += "\"" + missing + "\""; } throw new IllegalArgumentException("Contracting table not closed! 
Missing " + errorMessage); } - + //fixlater; variableLowCE = variableLow << 16; variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits - hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries - hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries - if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop)); + //int hangulHackBottom; + //int hangulHackTop; + + //hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries + //hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries + //if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop)); // show some statistics if (SHOW_STATS) System.out.println("\tcount1: " + count1); if (SHOW_STATS) System.out.println("\tcount2: " + max2); if (SHOW_STATS) System.out.println("\tcount3: " + max3); + if (SHOW_STATS) System.out.println("\tcontractions: " + ucaData.getContractionCount()); if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1)); if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2)); @@ -1912,7 +1635,7 @@ Compat: /** * Used for checking data file integrity */ - private Hashtable uniqueTable = new Hashtable(); + private Map uniqueTable = new HashMap(); /** * Used for checking data file integrity diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 669dda374b0..3a1d8e83d2d 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: 
/xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.18 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.19 $ * ******************************************************************************* */ @@ -31,9 +31,12 @@ import com.ibm.text.UCD.UCD_Types; import com.ibm.text.utility.*; import com.ibm.text.UCD.Normalizer; -public class WriteCollationData implements UCD_Types { +public class WriteCollationData implements UCD_Types, UCA_Types { static final boolean DEBUG = false; + static final boolean DEBUG_SHOW_ITERATION = true; + + public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; @@ -289,7 +292,21 @@ public class WriteCollationData implements UCD_Types { static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException { - UCD ucd30 = UCD.make("3.0.0"); + //UCD ucd30 = UCD.make("3.0.0"); + +/* +U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON + => U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON +*/ + String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"}; + for (int jj = 0; jj < testList.length; ++jj) { + String t = testList[jj]; + System.out.println(ucd.getCodeAndName(t)); + String test = collator.getSortKey(t, UCA.NON_IGNORABLE); + System.out.println("Decomp: " + collator.toString(test)); + test = collator.getSortKey(t, UCA.NON_IGNORABLE, false); + System.out.println("No Dec: " + collator.toString(test)); + } PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? 
"_SHORT" : "") + ".txt", true, false); if (!shortPrint) log.write('\uFEFF'); @@ -297,9 +314,39 @@ public class WriteCollationData implements UCD_Types { System.out.println("Sorting"); int counter = 0; - for (int i = 0; i <= 0x10FFFF; ++i) { + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null); + cc.enableSamples(); + UnicodeSet found2 = new UnicodeSet(); + + while (true) { + String s = cc.next(); + if (s == null) break; + + found2.addAll(s); + + if (DEBUG_SHOW_ITERATION) { + int cp = UTF16.charAt(s, 0); + if (cp == 0x220 || !ucd.isAssigned(cp) || ucd.isCJK_BASE(cp)) { + System.out.println(ucd.getCodeAndName(s)); + } + } Utility.dot(counter++); - if (!ucd.isRepresented(i)) continue; + addStringX(s, option); + // TODO: add other accents with Cyrillic + } + + UnicodeSet found = collator.found; + if (!found2.containsAll(found)) { /* the iteration must cover everything in collator.found */ + System.out.println("In both: " + new UnicodeSet(found).retainAll(found2).toPattern(true)); + System.out.println("In UCA but not iteration: " + new UnicodeSet(found).removeAll(found2).toPattern(true)); + System.out.println("In iteration but not UCA: " + new UnicodeSet(found2).removeAll(found).toPattern(true)); + throw new IllegalArgumentException("Inconsistent data"); + + } + + /* + for (int i = 0; i <= 0x10FFFF; ++i) { + if (!ucd.isAssigned(i)) continue; addStringX(UTF32.valueOf32(i), option); } @@ -318,15 +365,6 @@ public class WriteCollationData implements UCD_Types { addStringX(s, option); } - for (int i = 0; ; ++i) { // add first unallocated character - if (!ucd.isAssigned(i)) { - String s = UTF32.valueOf32(i); - Utility.fixDot(); - System.out.println("Adding: " + Utility.hex(s)); - addStringX(s, option); - break; - } - } for (int i = 0; i < extraConformanceRanges.length; ++i) { @@ -343,6 +381,7 @@ public class WriteCollationData implements UCD_Types { addStringX(end-1, option); addStringX(end, option); } + */ Utility.fixDot(); System.out.println("Total: " + sortedD.size()); @@ -364,12 +403,12 @@ public class
WriteCollationData implements UCD_Types { //String status = key.equals(lastKey) ? "*" : ""; //lastKey = key; //log.println(source); + char extra = source.charAt(source.length()-1); String clipped = source.substring(0, source.length()-1); - String stren = source.substring(source.length()-1); if (!shortPrint) { log.print(Utility.hex(source)); log.print( - ";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key)); + ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key)); } else { log.print(source + "\t" + Utility.hex(clipped)); } @@ -384,13 +423,15 @@ public class WriteCollationData implements UCD_Types { static void addStringX(int x, byte option) { addStringX(UTF32.valueOf32(x), option); } + + static final char LOW_ACCENT = '\u0325'; static void addStringX(String s, byte option) { addStringY(s + 'a', option); addStringY(s + 'A', option); addStringY(s + 'á', option); addStringY(s + 'b', option); - addStringY(s + '\u0325', option); + addStringY(s + LOW_ACCENT, option); addStringY(s + '!', option); } @@ -527,7 +568,7 @@ public class WriteCollationData implements UCD_Types { if (!arraysMatch(kenCes, kenLen, markCes, markLen)) { int kenCLen = fixCompatibilityCE(s, true, kenComp, true); - String comp = collator.ceToString(kenComp, kenCLen); + String comp = CEList.toString(kenComp, kenCLen); if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) { forLater.put((char)(COMPRESSED | type) + s, comp); @@ -567,10 +608,10 @@ public class WriteCollationData implements UCD_Types { String comp = (String)forLater.get(key); int kenLen = collator.getCEs(s, decompType, kenCes); - String kenStr = collator.ceToString(kenCes, kenLen); + String kenStr = CEList.toString(kenCes, kenLen); int markLen = fixCompatibilityCE(s, true, markCes, false); - String markStr = collator.ceToString(markCes, markLen); + String markStr = CEList.toString(markCes, markLen); if ((type & COMPRESSED) != 0) { log.println("COMPRESSED #" + (++count) + ": " + 
ucd.getCodeAndName(s)); @@ -589,7 +630,7 @@ public class WriteCollationData implements UCD_Types { log.println("NFD : " + ucd.getCodeAndName(nfd)); } //kenCLen = collator.getCEs(decomp, true, kenComp); - //log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen)); + //log.println("decomp ce: " + CEList.toString(kenComp, kenCLen)); } log.println(); } @@ -785,7 +826,7 @@ public class WriteCollationData implements UCD_Types { if (s.length() > 1) { diLog.println(Utility.hex(s, " ") - + ";\t #" + collator.ceToString(ces, len) + + ";\t #" + CEList.toString(ces, len) + " ( " + s + " )" + " " + ucd.getName(s)); } @@ -859,7 +900,7 @@ public class WriteCollationData implements UCD_Types { ccc = UTF32.char32At(s,kk); byte cat = ucd.getCategory(ccc); if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) { - sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s)); + sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s)); break; } } @@ -882,7 +923,7 @@ public class WriteCollationData implements UCD_Types { if (collator.isVariable(ce)) haveMixture |= 1; else haveMixture |= 2; if (haveMixture == 3) { - mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s)); + mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s)); } } } @@ -1030,7 +1071,7 @@ public class WriteCollationData implements UCD_Types { ccc = UTF32.char32At(s,kk); byte cat = ucd.getCategory(ccc); if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) { - sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s)); + sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s)); break; } } @@ -1053,7 +1094,7 @@ public class WriteCollationData implements UCD_Types { if (collator.isVariable(ce)) haveMixture |= 1; else haveMixture |= 2; if (haveMixture == 3) { - mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s)); + mixedCEs.add(CEList.toString(ces, 
len) + "\t" + ucd.getCodeAndName(s)); } } } @@ -1130,8 +1171,8 @@ public class WriteCollationData implements UCD_Types { + "\t" + head //+ "\t" + Utility.hex(oldWeight) //+ " => " + Utility.hex(newWeight) - + "\t" + collator.ceToString(ces, len) - + (doNew ? " => " + collator.ceToString(newCes, newLen) : "") + + "\t" + CEList.toString(ces, len) + + (doNew ? " => " + CEList.toString(newCes, newLen) : "") + "\t( " + src + " )" + "\t" + ucd.getName(src) ); @@ -1198,7 +1239,7 @@ public class WriteCollationData implements UCD_Types { if (false) { int len2 = collator.getCEs("\u2474", true, ces); - System.out.println(UCA.ceToString(ces, len2)); + System.out.println(CEList.toString(ces, len2)); String a = collator.getSortKey("a"); String b = collator.getSortKey("A"); @@ -1442,9 +1483,9 @@ F900..FAFF; CJK Compatibility Ideographs if (false) System.out.println( - collator.ceToString(lastCE) + " " - + collator.ceToString(ce) + " " - + collator.ceToString(nextCE) + " " + CEList.toString(lastCE) + " " + + CEList.toString(ce) + " " + + CEList.toString(nextCE) + " " + ucd.getCodeAndName(chr) ); @@ -1513,7 +1554,7 @@ F900..FAFF; CJK Compatibility Ideographs */ if (chr.equals("\u2F00")) { - System.out.println(UCA.ceToString(ces, len)); + System.out.println(CEList.toString(ces, len)); } // There are double-CEs, so we have to know what the length of the first bit is. @@ -1561,7 +1602,7 @@ F900..FAFF; CJK Compatibility Ideographs if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion)); if (option == WITH_NAMES) { log.print("\t# " - + collator.ceToString(ces, len) + " " + + CEList.toString(ces, len) + " " + ucd.getCodeAndName(chr)); if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion)); } @@ -1801,7 +1842,7 @@ F900..FAFF; CJK Compatibility Ideographs // we failed completely. 
Print error message, and bail - System.out.println("No back map for " + collator.ceToString(ces[i]) + System.out.println("No back map for " + CEList.toString(ces[i]) + " from " + CEList.toString(ces, len)); System.out.println("\t" + ucd.getCodeAndName(chr) + " => " + ucd.getCodeAndName(nfkdNew.normalize(chr)) @@ -2126,6 +2167,7 @@ F900..FAFF; CJK Compatibility Ideographs continue; } canIt.setSource(key); + boolean first = true; while (true) { String s = canIt.next(); @@ -2134,9 +2176,6 @@ F900..FAFF; CJK Compatibility Ideographs if (contentsForCanonicalIteration.contains(s)) continue; if (additionalSet.contains(s)) continue; - if (s.equals("\u01EC")) { - System.out.println("01ec"); - } // Skip anything that is not FCD. if (!NFD.isFCD(s)) continue; @@ -2234,7 +2273,7 @@ F900..FAFF; CJK Compatibility Ideographs log.println("# - Differs from previous version in that MAX value was introduced at 1F."); log.println("# All tertiary values are shifted down by 1, filling the gap at 7!"); - int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24; + int firstImplicit = getImplicitPrimary(CJK_BASE) >>> 24; int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24; log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]"); log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]"); @@ -2285,13 +2324,15 @@ F900..FAFF; CJK Compatibility Ideographs int sec = UCA.getSecondary(ces[q]); int ter = UCA.getTertiary(ces[q]); - oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16); + oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16); // special treatment for unsupported! 
if (UCA.isImplicitLeadPrimary(pri)) { + System.out.println("DEBUG: " + CEList.toString(ces, len) + + ", Current: " + q + ", " + ucd.getCodeAndName(chr)); ++q; - oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16); + oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16); int pri2 = UCA.getPrimary(ces[q]); // get old code point @@ -2301,7 +2342,7 @@ F900..FAFF; CJK Compatibility Ideographs // double check results! int[] testImplicit = new int[2]; - UCA.CodepointToImplicit(cp, testImplicit); + collator.CodepointToImplicit(cp, testImplicit); boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1]; if (gotError) { System.out.println("ERROR"); @@ -2360,7 +2401,7 @@ F900..FAFF; CJK Compatibility Ideographs } if (nonePrinted) { log.print("[,,]"); - oldStr.append(UCA.ceToString(0)); + oldStr.append(CEList.toString(0)); } longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0))); log.println(); @@ -2386,7 +2427,7 @@ F900..FAFF; CJK Compatibility Ideographs boolean lastOne = false; for (int i = 0; i < 0x10FFFF; ++i) { - boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i); + boolean thisOne = ucd.isCJK_BASE(i) || ucd.isCJK_AB(i); if (thisOne != lastOne) { summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1))); summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i))); @@ -2425,7 +2466,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") " + Utility.hex(sampleEq[i]) + " "); for (int q = 0; q < len; ++q) { - summary.print(UCA.ceToString(ces[q])); + summary.print(CEList.toString(ces[q])); } summary.println(" " + ucd.getName(sampleEq[i])); } @@ -2499,24 +2540,24 @@ F900..FAFF; CJK Compatibility Ideographs */ static int swapCJK(int i) { - if (i >= UCA.CJK_BASE) { - if (i < 
UCA.CJK_LIMIT) return i - UCA.CJK_BASE; + if (i >= CJK_BASE) { + if (i < CJK_LIMIT) return i - CJK_BASE; - if (i < UCA.CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; + if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; - if (i < UCA.CJK_COMPAT_USED_LIMIT) return i - UCA.CJK_COMPAT_USED_BASE - + (UCA.CJK_LIMIT - UCA.CJK_BASE); - if (i < UCA.CJK_B_BASE) return i + NON_CJK_OFFSET; + if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE + + (CJK_LIMIT - CJK_BASE); + if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; - if (i < UCA.CJK_B_LIMIT) return i; // non-BMP-CJK + if (i < CJK_B_LIMIT) return i; // non-BMP-CJK return i + NON_CJK_OFFSET; // non-CJK } - if (i < UCA.CJK_A_BASE) return i + NON_CJK_OFFSET; + if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; - if (i < UCA.CJK_A_LIMIT) return i - UCA.CJK_A_BASE - + (UCA.CJK_LIMIT - UCA.CJK_BASE) - + (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE); + if (i < CJK_A_LIMIT) return i - CJK_A_BASE + + (CJK_LIMIT - CJK_BASE) + + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); return i + NON_CJK_OFFSET; // non-CJK } @@ -2642,14 +2683,14 @@ static int swapCJK(int i) { oldPrimary = newPrimary; } - showImplicit("# First CJK", UCA.CJK_BASE); - showImplicit("# Last CJK", UCA.CJK_LIMIT-1); - showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE); - showImplicit("# Last CJK-compat", UCA.CJK_COMPAT_USED_LIMIT-1); - showImplicit("# First CJK_A", UCA.CJK_A_BASE); - showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1); - showImplicit("# First CJK_B", UCA.CJK_B_BASE); - showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1); + showImplicit("# First CJK", CJK_BASE); + showImplicit("# Last CJK", CJK_LIMIT-1); + showImplicit("# First CJK-compat", CJK_COMPAT_USED_BASE); + showImplicit("# Last CJK-compat", CJK_COMPAT_USED_LIMIT-1); + showImplicit("# First CJK_A", CJK_A_BASE); + showImplicit("# Last CJK_A", CJK_A_LIMIT-1); + showImplicit("# First CJK_B", CJK_B_BASE); + showImplicit("# Last CJK_B", CJK_B_LIMIT-1); showImplicit("# First 
Other Implicit", 0); showImplicit("# Last Other Implicit", 0x10FFFF); @@ -2667,9 +2708,9 @@ static int swapCJK(int i) { // separate the three groups - if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) { + if (ucd.isCJK_BASE(i) || CJK_COMPAT_USED_BASE <= i && i < CJK_COMPAT_USED_LIMIT) { if (batch != 0) continue; - } else if (UCA.isCJK_AB(i)) { + } else if (ucd.isCJK_AB(i)) { if (batch != 1) continue; } else if (batch != 2) continue; @@ -2993,7 +3034,7 @@ static int swapCJK(int i) { for (char ch = 0; ch < 0xFFFF; ++ch) { byte type = collator.getCEType(ch); - if (type < UCA.FIXED_CE) { + if (type < FIXED_CE) { int len = collator.getCEs(String.valueOf(ch), true, ces); int primary = UCA.getPrimary(ces[0]); if (primary < variableHigh) continue; @@ -3088,36 +3129,22 @@ static int swapCJK(int i) { System.out.println("Sorting"); for (int i = 0; i <= 0xFFFF; ++i) { - if (EXCLUDE_UNSUPPORTED && !collator.found.get(i)) continue; + if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue; if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use //if (0xA000 <= c && c <= 0xA48F) continue; // skip YI addString(UTF32.valueOf32(i), option); } - Hashtable multiTable = collator.getContracting(); - Enumeration enum = multiTable.keys(); - while (enum.hasMoreElements()) { - addString((String)enum.nextElement(), option); - } + + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null); + cc.enableSamples(); - for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters - addString(extraConformanceTests[i], option); + while (true) { + String s = cc.next(); + if (s == null) break; + addString(s, option); } - - for (int i = 0; i < extraConformanceRanges.length; ++i) { - int start = extraConformanceRanges[i][0]; - int end = extraConformanceRanges[i][1]; - int increment = ((end - start + 1) / 303) + 1; - //System.out.println("Range: " + start + ", " + end + ", " + increment); - addString(start, 
option); - for (int j = start+1; j < end-1; j += increment) { - addString(j, option); - addString(j+1, option); - } - addString(end-1, option); - addString(end, option); - } - + System.out.println("Total: " + sortedD.size()); Iterator it; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java b/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java index 42de5a19ada..e1bbb9cca26 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java @@ -5,12 +5,14 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $ -* $Date: 2002/05/31 01:41:03 $ -* $Revision: 1.7 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.8 $ * ******************************************************************************* */ +WARNING: OLD FILE. DON'T COMPILE. + package com.ibm.text.UCA; import java.util.*; @@ -21,6 +23,7 @@ import com.ibm.text.UCD.*; import com.ibm.text.utility.*; public class WriteHTMLCollation implements UCD_Types { + public static final String copyright = "Copyright (C) 2000, IBM Corp. and others.
All Rights Reserved."; @@ -74,8 +77,8 @@ public class WriteHTMLCollation implements UCD_Types { */ // DO FOLLOWING - writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE); - writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED); + //writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE); + //writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED); // SKIP BELOW if (true) return; @@ -178,7 +181,7 @@ public class WriteHTMLCollation implements UCD_Types { } return result.toString(); } - + /* static void writeConformance(String filename, byte option) throws IOException { PrintWriter log = Utility.openPrintWriter(filename); @@ -193,6 +196,7 @@ public class WriteHTMLCollation implements UCD_Types { addStringX(c, option); } + Hashtable multiTable = collator.getContracting(); Enumeration enum = multiTable.keys(); while (enum.hasMoreElements()) { @@ -248,7 +252,8 @@ public class WriteHTMLCollation implements UCD_Types { sortedD.clear(); System.out.println("Done"); } - + */ + static void addStringX(int x, byte option) { addStringX(String.valueOf((char)x), option); } @@ -382,7 +387,7 @@ public class WriteHTMLCollation implements UCD_Types { if (!arraysMatch(kenCes, kenLen, markCes, markLen)) { int kenCLen = fixCompatibilityCE(s, true, kenComp, true); - String comp = collator.ceToString(kenComp, kenCLen); + String comp = CEList.toString(kenComp, kenCLen); if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) { forLater.put((char)(COMPRESSED | type) + s, comp); @@ -422,10 +427,10 @@ public class WriteHTMLCollation implements UCD_Types { String comp = (String)forLater.get(key); int kenLen = collator.getCEs(s, decompType, kenCes); - String kenStr = collator.ceToString(kenCes, kenLen); + String kenStr = CEList.toString(kenCes, kenLen); int markLen = fixCompatibilityCE(s, true, markCes, false); - String markStr = collator.ceToString(markCes, markLen); + String markStr = CEList.toString(markCes, markLen); if ((type & COMPRESSED) != 0) { 
log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s)); @@ -444,7 +449,7 @@ public class WriteHTMLCollation implements UCD_Types { log.println("NFD : " + ucd.getCodeAndName(nfdstr)); } //kenCLen = collator.getCEs(decomp, true, kenComp); - //log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen)); + //log.println("decomp ce: " + CEList.toString(kenComp, kenCLen)); } log.println(); } @@ -569,7 +574,7 @@ public class WriteHTMLCollation implements UCD_Types { { int len2 = collator.getCEs("\u2474", true, ces); - System.out.println(UCA.ceToString(ces, len2)); + System.out.println(CEList.toString(ces, len2)); String a = collator.getSortKey("a"); String b = collator.getSortKey("A"); @@ -640,7 +645,7 @@ public class WriteHTMLCollation implements UCD_Types { else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) relation = " <<<"; lastCE = ce; if (chr.equals("\u2474")) { - System.out.println(UCA.ceToString(ces, len)); + System.out.println(CEList.toString(ces, len)); } // check expansions @@ -653,7 +658,7 @@ public class WriteHTMLCollation implements UCD_Types { int probe = ces[i]; String s = getFromBackMap(backMap, probe); if (s == null) { - System.out.println("No back map for " + collator.ceToString(ces[i]) + System.out.println("No back map for " + CEList.toString(ces[i]) + ": " + ucd.getCodeAndName(chr)); expansion += "[" + Utility.hex(ces[i]) + "]"; } else { @@ -943,7 +948,7 @@ public class WriteHTMLCollation implements UCD_Types { } if (sampleEq[sec] == null) sampleEq[sec] = chr; if (sampleEq[ter] == null) sampleEq[ter] = chr; - oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16); + oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16); int np = primaryDelta[UCA.getPrimary(ces[q])]; hexBytes(np, newPrimary); hexBytes(fixSecondary(UCA.getSecondary(ces[q])), newSecondary); @@ -968,7 +973,7 @@ public class WriteHTMLCollation implements UCD_Types { } if (nonePrinted) { log.print("[,,]"); 
- oldStr.append(UCA.ceToString(0)); + oldStr.append(CEList.toString(0)); } log.println(" # " + oldStr + " # " + ucd.getName(chr.charAt(0))); lastChr = chr; @@ -1017,7 +1022,7 @@ public class WriteHTMLCollation implements UCD_Types { summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") " + Utility.hex(sampleEq[i]) + " "); for (int q = 0; q < len; ++q) { - summary.print(UCA.ceToString(ces[q])); + summary.print(CEList.toString(ces[q])); } summary.println(" " + ucd.getName(sampleEq[i])); } @@ -1438,7 +1443,7 @@ public class WriteHTMLCollation implements UCD_Types { for (int i = 0; i <= 0xFFFF; ++i) { char c = (char)i; - if (EXCLUDE_UNSUPPORTED && !collator.found.get(c)) continue; + if (EXCLUDE_UNSUPPORTED && !collator.found.contains(c)) continue; if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use //if (0xA000 <= c && c <= 0xA48F) continue; // skip YI addString(String.valueOf(c), option); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 81222011cdd..0ae1b3c835c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.15 $ +* $Date: 2002/06/15 02:47:14 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -63,6 +63,7 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable(); else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML(); else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed(); + else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime(); else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability(); 
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index aa4c04ca3bd..c533c1b313c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.13 $ +* $Date: 2002/06/15 02:47:13 $ +* $Revision: 1.14 $ * ******************************************************************************* */ @@ -146,7 +146,7 @@ public final class UCD implements UCD_Types { * Get the character names for the code points in a string, separated by ", " */ public String getName(String s, byte style) { - if (s.length() == 1) return get(s.charAt(0), true).name; + if (s.length() == 1) return getName(s.charAt(0), style); StringBuffer result = new StringBuffer(); int cp; for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { @@ -182,15 +182,15 @@ public final class UCD implements UCD_Types { /** * Get the name and number (U+xxxx NAME) for a code point */ - public String getCodeAndName(int codePoint) { - return getCode(codePoint) + " " + getName(codePoint); + public String getCodeAndName(int codePoint, byte type) { + return getCode(codePoint) + " " + getName(codePoint, type); } /** * Get the name and number (U+xxxx NAME) for the code points in a string, * separated by ", " */ - public String getCodeAndName(String s) { + public String getCodeAndName(String s, byte type) { if (s == null || s.length() == 0) return "NULL"; if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path StringBuffer result = new StringBuffer(); @@ -203,6 +203,20 @@ public final class UCD implements UCD_Types { return result.toString(); } + /** + * Get the name and number (U+xxxx NAME) for a code point + */ + 
public String getCodeAndName(int codePoint) { + return getCodeAndName(codePoint, NORMAL); + } + + /** + * Get the name and number (U+xxxx NAME) for the code points in a string, separated by ", " + */ + public String getCodeAndName(String s) { + return getCodeAndName(s, NORMAL); + } + + /** * Get the general category */ @@ -990,10 +1004,20 @@ to guarantee identifier closure. result = getRaw(codePoint); if (result == null) { result = UData.UNASSIGNED; - if (fixStrings) result.name = ""; + result.name = null; // clean this up, since we reuse UNASSIGNED + result.shortName = null; + if (fixStrings) { + result.name = ""; + } } - if (result.shortName != null && result.shortName.length() == 0) { - result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS); + if (fixStrings) { + if (result.name == null) { + result.name = ""; + System.out.println("Warning: fixing name for " + result.name); + } + if (result.shortName == null) { + result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS); + } } return result; case 0x3400: // CJK Ideograph Extension A @@ -1024,6 +1048,8 @@ to guarantee identifier closure. result = getRaw(rangeStart); if (result == null) { result = UData.UNASSIGNED; + result.name = null; // clean this up, since we reuse UNASSIGNED + result.shortName = null; if (fixStrings) { result.name = ""; result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS); @@ -1047,6 +1073,32 @@ to guarantee identifier closure. return result; } + // Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6] + + public static final boolean isCJK_AB(int bigChar) { + return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT + || CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT); + } + + public static boolean isCJK_BASE(int cp) { + return (CJK_BASE <= cp && cp < CJK_LIMIT + || cp == 0xFA0E // compat characters that don't decompose. 
+ || cp == 0xFA0F + || cp == 0xFA11 + || cp == 0xFA13 + || cp == 0xFA14 + || cp == 0xFA1F + || cp == 0xFA21 + || cp == 0xFA23 + || cp == 0xFA24 + || cp == 0xFA27 + || cp == 0xFA28 + || cp == 0xFA29 + || cp == 0xFA2E + || cp == 0xFA2F + ); + } + // Hangul constants public static final int @@ -1108,7 +1160,7 @@ to guarantee identifier closure. return 0xFFFF; // no composition } - static boolean isHangulSyllable(int char1) { + static public boolean isHangulSyllable(int char1) { return SBase <= char1 && char1 < SLimit; } diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 05b771caf4e..86fc89bd8f3 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2002/05/29 02:01:00 $ -* $Revision: 1.12 $ +* $Date: 2002/06/15 02:47:13 $ +* $Revision: 1.13 $ * ******************************************************************************* */ @@ -21,8 +21,17 @@ public interface UCD_Types { public static final String UCD_DIR = BASE_DIR + "UCD\\"; public static final String BIN_DIR = BASE_DIR + "BIN\\"; public static final String GEN_DIR = BASE_DIR + "GEN\\"; - - + + public static final int + CJK_BASE = 0x4E00, + CJK_LIMIT = 0x9FFF+1, + CJK_COMPAT_USED_BASE = 0xFA0E, + CJK_COMPAT_USED_LIMIT = 0xFA2F+1, + CJK_A_BASE = 0x3400, + CJK_A_LIMIT = 0x4DBF+1, + CJK_B_BASE = 0x20000, + CJK_B_LIMIT = 0x2A6DF+1; + static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes // Unicode Property Types diff --git a/tools/unicodetools/com/ibm/text/UCD/UData.java b/tools/unicodetools/com/ibm/text/UCD/UData.java index c5aff42663e..ae16a129eda 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UData.java +++ b/tools/unicodetools/com/ibm/text/UCD/UData.java @@ -5,8 +5,8 @@ 
******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.4 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -18,7 +18,7 @@ import com.ibm.text.utility.*; class UData implements UCD_Types { String name; - String shortName = ""; // cache + String shortName; // cache String decompositionMapping; String simpleUppercase; String simpleLowercase; diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java index 7f2cdd8e8ac..efb7bf12ce0 100644 --- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.15 $ +* $Date: 2002/06/15 02:47:12 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -27,6 +27,27 @@ import com.ibm.text.utility.*; import java.text.NumberFormat; public class VerifyUCD implements UCD_Types { + + static void oneTime() { + Default.setUCD(); + int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000 + for (int i = 0; i < testSet.length; ++i) { + int item = testSet[i]; + System.out.println(Default.ucd.getCode(item)); + + boolean ass = Default.ucd.isAssigned(item); + System.out.println(ass ? " assigned" : " unassigned"); + ass = Default.ucd.isAllocated(item); + System.out.println(ass ? 
" allocated" : " unallocated"); + + String name = Default.ucd.getName(item, SHORT); + System.out.println(" " + name); + name = Default.ucd.getName(item); + System.out.println(" " + name); + + System.out.println(); + } + } static final byte NC = UNUSED_CATEGORY; diff --git a/tools/unicodetools/com/ibm/text/utility/IntStack.java b/tools/unicodetools/com/ibm/text/utility/IntStack.java index 5fdca1f1f9f..401a2dd3b85 100644 --- a/tools/unicodetools/com/ibm/text/utility/IntStack.java +++ b/tools/unicodetools/com/ibm/text/utility/IntStack.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $ -* $Date: 2001/09/19 23:33:52 $ -* $Revision: 1.3 $ +* $Date: 2002/06/15 02:47:14 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -17,30 +17,65 @@ package com.ibm.text.utility; // Simple stack mechanism, with push, pop and access // ============================================================= -public final class IntStack implements Comparable { +public final class IntStack implements Comparable, Cloneable { private int[] values; private int top = 0; + private int first = 0; public IntStack(int initialSize) { values = new int[initialSize]; } + + public IntStack append(IntStack other) { + // TODO speed up by copying arrays + for (int i = 0; i < other.getTop(); ++i) { + push(other.get(i)); + } + return this; + } - public void push(int value) { + public IntStack append(int value) { + return push(value); + } + + public int length() { + return top - first; + } + + public IntStack push(int value) { if (top >= values.length) { // must grow? 
int[] temp = new int[values.length*2]; System.arraycopy(values,0,temp,0,values.length); values = temp; } values[top++] = value; + return this; } public int pop() { - if (top > 0) return values[--top]; + if (top > first) { + int result = values[--top]; + if (top == first && first > 0) { + top = first = 0; + } + return result; + } + throw new IllegalArgumentException("Stack underflow"); + } + + public int popFront() { + if (top > first) { + int result = values[first++]; + if (top == first) { + top = first = 0; + } + return result; + } throw new IllegalArgumentException("Stack underflow"); } public int get(int index) { - if (0 <= index && index < top) return values[index]; + if (first <= index && index < top) return values[index]; throw new IllegalArgumentException("Stack index out of bounds"); } @@ -49,22 +84,24 @@ public final class IntStack implements Comparable { } public boolean isEmpty() { - return top == 0; + return top - first == 0; } public void clear() { - top = 0; + top = first = 0; } public int compareTo(Object other) { IntStack that = (IntStack) other; - int min = top; - if (min < that.top) min = that.top; - for (int i = 0; i < min; ++i) { - int result = values[i] - that.values[i]; + int myLen = top - first; + int thatLen = that.top - that.first; + int limit = first + ((myLen < thatLen) ? 
myLen : thatLen); + int delta = that.first - first; + for (int i = first; i < limit; ++i) { + int result = values[i] - that.values[i + delta]; if (result != 0) return result; } - return top - that.top; + return myLen - thatLen; } public boolean equals(Object other) { @@ -73,9 +110,19 @@ public final class IntStack implements Comparable { public int hashCode() { int result = top; - for (int i = 0; i < top; ++i) { + for (int i = first; i < top; ++i) { result = result * 37 + values[i]; } return result; } + + public Object clone() { + try { + IntStack result = (IntStack) (super.clone()); + result.values = (int[]) result.values.clone(); + return result; + } catch (CloneNotSupportedException e) { + throw new IllegalArgumentException("Will never happen"); + } + } } \ No newline at end of file