diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 1b0b4e1b59b..728cc9896ba 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2002/06/22 21:02:16 $ -* $Revision: 1.16 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -110,7 +110,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (ucdData.getDecompositionType(cp) == NONE) return false; String norm = nfx.normalize(cp); if (UTF16.countCodePoint(norm) != 1) return true; @@ -133,7 +133,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# Characters that are cc==0, BUT which may interact with previous characters." ; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (ucdData.getCombiningClass(cp) != 0) return false; String norm = nfx.normalize(cp); int first = UTF16.charAt(norm, 0); @@ -172,7 +172,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { boolean result = bitset.get(cp); if (result && filter) { result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero; @@ -243,7 +243,7 @@ public final class DerivedProperty implements UCD_Types { //if (cp >= 0xAC00 && cp <= 0xD7A3) return true; //System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps))); } // default - boolean hasValue(int cp) { return getValue(cp).length() != 0; } + public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; class CaseDProp extends UnicodeProperty { @@ -256,7 +256,7 @@ public final class DerivedProperty implements UCD_Types { header = "# Derived Property: " + name + "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == val || val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false; @@ -294,7 +294,7 @@ public final class DerivedProperty implements UCD_Types { return getValue(cp, LONG); } - boolean hasValue(int cp) { return getValue(cp).length() != 0; } + public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; { @@ -323,7 +323,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# Characters that can start an identifier." + "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { return ucdData.isIdentifierStart(cp, false); } }; @@ -338,7 +338,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc" + "\r\n# NOTE: Cf characters should be filtered out."; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { return ucdData.isIdentifierContinue_NO_Cf(cp, false); } }; @@ -354,7 +354,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# NOTE: Does NOT remove the non-NFKx characters." + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { return ucdData.isIdentifierStart(cp, true); } }; @@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types { + "\r\n# NOTE: Does NOT remove the non-NFKx characters." + "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { return ucdData.isIdentifierContinue_NO_Cf(cp, true); } }; @@ -384,7 +384,7 @@ public final class DerivedProperty implements UCD_Types { header = "# Derived Property: " + name + "\r\n# Generated from: Sm + Other_Math"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Sm || ucdData.getBinaryProperty(cp,Math_Property)) return true; @@ -400,7 +400,7 @@ public final class DerivedProperty implements UCD_Types { header = "# Derived Property: " + name + "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl || ucdData.getBinaryProperty(cp, Alphabetic)) return true; @@ -416,7 +416,7 @@ public final class DerivedProperty implements UCD_Types { header = "# Derived Property: " + name + "\r\n# Generated from: Ll + Other_Lowercase"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Ll || ucdData.getBinaryProperty(cp, Other_Lowercase)) return true; @@ -432,7 +432,7 @@ public final class DerivedProperty implements UCD_Types { header = "# Derived Property: " + name + "\r\n# Generated from: Lu + Other_Uppercase"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Lu || ucdData.getBinaryProperty(cp, Other_Uppercase)) return true; @@ -461,7 +461,7 @@ of characters, the first of which has a non-zero combining class. + ": Full Composition Exclusion" + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (!ucdData.isRepresented(cp)) return false; byte dtype = ucdData.getDecompositionType(cp); if (dtype != CANONICAL) return false; @@ -488,7 +488,7 @@ of characters, the first of which has a non-zero combining class. + ": Full Composition Inclusion" + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (!ucdData.isRepresented(cp)) return false; byte dtype = ucdData.getDecompositionType(cp); if (dtype != CANONICAL) return false; @@ -516,7 +516,7 @@ of characters, the first of which has a non-zero combining class. if (c.equals(b)) return ""; return "FNC; " + Utility.hex(c); } // default - boolean hasValue(int cp) { return getValue(cp).length() != 0; } + public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; dprops[FC_NFC_Closure] = new UnicodeProperty() { @@ -538,7 +538,7 @@ of characters, the first of which has a non-zero combining class. if (c.equals(b)) return ""; return "FN; " + Utility.hex(c); } // default - boolean hasValue(int cp) { return getValue(cp).length() != 0; } + public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; for (int i = QuickNFD; i <= QuickNFKC; ++i) { @@ -555,7 +555,7 @@ of characters, the first of which has a non-zero combining class. + "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>" + "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true; if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true; if (ucdData.getBinaryProperty(cp, White_space)) return false; @@ -573,7 +573,7 @@ of characters, the first of which has a non-zero combining class. header = header = "# Binary Property"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { switch(cp) { case 0x27: case 0x2019: case 0xAD: return true; // case 0x2d: case 0x2010: case 0x2011: @@ -600,7 +600,7 @@ of characters, the first of which has a non-zero combining class. + "\r\n# - has no combining marks with zero canonical combining class" ; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (hasSoftDot(cp)) return true; if (Default.nfkd.isNormalized(cp)) return false; String decomp = Default.nfd.normalize(cp); @@ -629,7 +629,7 @@ of characters, the first of which has a non-zero combining class. header = header = "# Derived Property: " + name + "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true; if (dprops[Other_Case_Ignorable].hasValue(cp)) return true; @@ -654,7 +654,7 @@ of characters, the first of which has a non-zero combining class. + "\r\n# (CGJ = U+034F)"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (cp == 0x034F) return false; if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false; byte cat = ucdData.getCategory(cp); @@ -674,7 +674,7 @@ of characters, the first of which has a non-zero combining class. + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp" + "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ"; } - boolean hasValue(int cp) { + public boolean hasValue(int cp) { if (cp == 0x034F) return false; byte cat = ucdData.getCategory(cp); if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java index 13777bab0b6..54195ea7625 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.8 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -275,6 +275,11 @@ public final class GenerateHanTransliterator implements UCD_Types { log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS); log.print('\uFEFF'); + log.println(); + log.println("@*Override Data"); + log.println(); + readOverrides(type); + log.println(); log.println("@*DICT Data"); log.println(); @@ -426,7 +431,27 @@ public final class GenerateHanTransliterator implements UCD_Types { System.out.println("Defined Count: " + count); log.println(); - log.println("@Duplicates"); + log.println("@Duplicates (Frequency Order"); + log.println(); + it = rankList.iterator(); + while (it.hasNext()) { + String word = (String) it.next(); + Collection dups = (Collection) duplicates.get(word); + if (dups == null) continue; + log.print(hex.transliterate(word) + "\t" + word + "\t"); + Iterator it2 = dups.iterator(); + boolean gotFirst = false; + while (it2.hasNext()) { + if (!gotFirst) gotFirst = true; + else log.print(", "); + log.print(it2.next()); + } + if (overrideSet.contains(word)) log.print(" *override*"); + log.println(); + } + + log.println(); + log.println("@Duplicates (Character Order)"); log.println(); it = duplicates.keySet().iterator(); while (it.hasNext()) { @@ -440,6 +465,7 @@ public final class GenerateHanTransliterator implements UCD_Types { else log.print(", "); log.print(it2.next()); } + if (overrideSet.contains(word)) log.print(" *override*"); log.println(); } @@ -536,13 +562,19 @@ public final class GenerateHanTransliterator implements UCD_Types { int overallRank = 0; it = combinedRank.iterator(); - log.println(); - log.println("@Frequency data: Rank of Character"); - log.println(); + boolean showFrequency = false; + + if (showFrequency) { + log.println(); + log.println("@Frequency data: Rank of Character"); + log.println(); + } + + // make up rankMap, rankList while(it.hasNext()) { Pair p = (Pair) it.next(); - log.println(p.first + ", " + p.second); + if (showFrequency) log.println(p.first + ", " + p.second); Object rank = rankMap.get(p.second); if (rank == null) { rankMap.put(p.second, new Integer(++overallRank)); @@ -550,16 +582,18 @@ public final class GenerateHanTransliterator implements UCD_Types { } } - log.println(); - log.println("@Frequency data: Character to Rank"); - log.println(); - - // get full order - it = rankList.iterator(); - while (it.hasNext()) { - Comparable key = (Comparable) it.next(); - Comparable val = (Comparable) rankMap.get(key); - log.println(key + ", " + val); + if (showFrequency) { + log.println(); + log.println("@Frequency data: Character to Rank"); + log.println(); + + // get full order + it = rankList.iterator(); + while (it.hasNext()) { + Comparable key = (Comparable) it.next(); + Comparable val = (Comparable) rankMap.get(key); + log.println(key + ", " + val); + } } } catch (Exception e) { @@ -712,6 +746,38 @@ public final class GenerateHanTransliterator implements UCD_Types { } } + static void readOverrides(int type) throws IOException { + if (type != CHINESE) return; + String fname = "Chinese_override.txt"; + + System.out.println("Reading " + fname); + BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true); + int counter = 0; + String[] pieces = new String[50]; + String line = ""; + try { + while (true) { + line = Utility.readDataLine(br); + if (line == null) break; + if (line.length() == 0) continue; + Utility.dot(counter++); + + // skip code + int wordStart = line.indexOf('\t') + 1; + int wordEnd = line.indexOf('\t', wordStart); + String word = line.substring(wordStart, wordEnd); + String definition = line.substring(wordEnd+1); + addCheck(word, definition, line); + overrideSet.add(word); + } + br.close(); + } catch (Exception e) { + throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + } + } + + static Set overrideSet = new HashSet(); + static void processEdict(String word, String definition, String line) { // We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH // C = CJK, H = Hiragana, K = katakana diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java index 4dac9600f64..1976bedb10f 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $ -* $Date: 2002/07/30 09:57:18 $ -* $Revision: 1.1 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -21,61 +21,126 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; public class GenerateLineBreakTest implements UCD_Types { - - static String[] samples = new String[LB_LIMIT + 3]; - static byte[] TROrder = { + // COMMON STUFF for Hangul + static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5; + static final String[] hNames = {"L", "V", "T", "LV", "LVT"}; + + static byte getHangulType(int cp) { + if (Default.ucd.isLeadingJamo(cp)) return hL; + if (Default.ucd.isVowelJamo(cp)) return hV; + if (Default.ucd.isTrailingJamo(cp)) return hT; + if (Default.ucd.isHangulSyllable(cp)) { + if (Default.ucd.isDoubleHangul(cp)) return hLV; + return hLVT; + } + return hNot; + } + + //============================ + + protected String rule; + protected String fileName = "Line"; + + // all the other items are supplied in UCD_TYPES + static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, + LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, + LB2_LIMIT = (byte)(LB_SUP + 1); + + String[] samples = new String[100]; + + + byte[] TypeOrder = { LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, // missing from Pair Table LB_SP, LB_BK, LB_CR, LB_LF, // resolved types below LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, - // 3 JAMO CLASSES - 29, 30, 31 + // 3 JAMO CLASSES, plus supplementary + LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP }; - static final int TABLE_LIMIT = 25; - public static void main(String[] args) throws IOException { Default.setUCD(); + new GenerateLineBreakTest().run(); + new GenerateWordBreakTest().run(); + } + + // stuff that subclasses need to override + public void run() throws IOException { findSamples(); // test individual cases //printLine(out, samples[LB_ZW], "", samples[LB_CL]); //printLine(out, samples[LB_ZW], " ", samples[LB_CL]); - PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS); - out.println("

Current (fixed only for consistency):

"); + PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS); + out.println("" + + fileName + ""); + out.println("

Current (fixed only for consistency):

"); + + + generateTable(out, false); - out.println("

Recommended:

"); + out.println("

Recommended:

"); generateTable(out, true); out.println(""); out.close(); + String[] testCase = new String[50]; // do main test for (int k = 0; k < 2; ++k) { - out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS); + out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS); int counter = 0; - out.println("# Default Linebreak conformance test"); - out.println("# " + Default.getDate() + ", MED"); + out.println("# Default " + fileName + " Break Test"); + out.println("# Generated: " + Default.getDate() + ", MED"); + out.println("#"); + out.println("# Format:"); + out.println("# (# )? "); + out.println("# contains hex Unicode code points, with "); + out.println("#\t" + BREAK + " wherever there is a break opportunity, and "); + out.println("#\t" + NOBREAK + " wherever there is not."); + out.println("# the format can change, but currently it shows:"); + out.println("#\t- the sample character name"); + out.println("#\t- (x) the line_break property* for the sample character"); + out.println("#\t- [x] the rule that determines whether there is a break or not"); + out.println("#"); + out.println("# Samples:"); + out.println("# The test currently takes all pairs of linebreak types*,"); + out.println("# picks a sample for each type, and generates three strings: "); + out.println("#\t- the pair alone"); + out.println("#\t- the pair alone with an imbeded space"); + out.println("#\t- the pair alone with embedded combining marks"); + out.println("# The sample for each type is simply the first code point (above NULL)"); + out.println("# with that property."); + out.println("# * Note:"); + out.println("#\t- SG is omitted"); + out.println("#\t- 3 different Jamo characters and a supplementary character are added"); + out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); + out.println("#\t instead of the linebreak property"); + out.println("# These samples may be extended in the future."); out.println("#"); - for (int ii = 0; ii < samples.length; ++ii) { - int i = TROrder[ii]; + for (int ii = 0; ii < getLimit(); ++ii) { + int i = TypeOrder[ii]; + if (i == LB_SG) continue; String before = samples[i]; - for (int jj = 0; jj < samples.length; ++jj) { - Utility.dot(counter++); - int j = TROrder[jj]; + for (int jj = 0; jj < getLimit(); ++jj) { + Utility.dot(counter); + int j = TypeOrder[jj]; + if (j == LB_SG) continue; String after = samples[j]; // do line straight - printLine(out, before, "", after, k != 0); - printLine(out, before, " ", after, k != 0); - printLine(out, before, "\u0301\u0308", after, k != 0); + int len = genTestItems(before, after, testCase); + for (int q = 0; q < len; ++q) { + printLine(out, testCase[q], k != 0 && q == 0, false); + ++counter; + } } } out.println("# Lines: " + counter); @@ -83,25 +148,80 @@ public class GenerateLineBreakTest implements UCD_Types { } } - public static void generateTable(PrintWriter out, boolean recommended) { - out.print(""); - for (int i = 0; i < TABLE_LIMIT; ++i) { - String h = getLBID(samples[TROrder[i]]); - out.print(""); + // stuff that subclasses need to override + public int genTestItems(String before, String after, String[] results) { + results[0] = before + after; + results[1] = before + " " + after; + results[2] = before + "\u0301\u0308" + after; + return 3; + } + + // stuff that subclasses need to override + boolean skipType(byte type) { + return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX; + } + + // stuff that subclasses need to override + public String getTypeID(int cp) { + byte result = getType(cp); + if (result == LB_SUP) return "SUP"; + if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; + return Default.ucd.getLineBreakID_fromIndex(result); + } + + // stuff that subclasses need to override + public byte getType(int cp) { + if (cp > 0xFFFF) return LB_SUP; + byte result = getHangulType(cp); + if (result != hNot) return (byte)(result + LB_LIMIT); + return Default.ucd.getLineBreak(cp); + } + + public int getLimit() { + return LB2_LIMIT; + } + + public int getTableLimit() { + return LB_SUP; // skip last; + } + + + public void generateTable(PrintWriter out, boolean recommended) { + String width = "width='" + (100 / (getTableLimit() + 1)) + "%'"; + out.print("
" + h + "
"); + byte type; + for (int i = 0; i < getTableLimit(); ++i) { + type = TypeOrder[i]; + if (skipType(type)) continue; + + String h = getTypeID(samples[TypeOrder[i]]); + out.print(""); } out.print(""); String[] rule = new String[1]; String[] rule2 = new String[1]; - for (int i = 0; i < TABLE_LIMIT; ++i) { - String before = samples[TROrder[i]]; - String line = ""; - for (int j = 0; j < TABLE_LIMIT; ++j) { - String after = samples[TROrder[j]]; + for (int i = 0; i < getTableLimit(); ++i) { + type = TypeOrder[i]; + if (skipType(type)) continue; + + String before = samples[type]; + String line = ""; + for (int j = 0; j < getTableLimit(); ++j) { + type = TypeOrder[j]; + if (skipType(type)) continue; + + String after = samples[type]; String t = getTableEntry(before, after, recommended, rule); String background = ""; - if (recommended) { - String t2 = getTableEntry(before, after, false, rule2); - if (!t.equals(t2)) background = " bgcolor='#FFFF00'"; + String t2 = getTableEntry(before, after, !recommended, rule2); + if (!t.equals(t2)) { + if (t.equals(NOBREAK)) { + background = " bgcolor='#CCFFFF'"; + } else { + background = " bgcolor='#FFFF00'"; + } + } else if (t.equals(NOBREAK)) { + background = " bgcolor='#CCCCFF'"; } line += ""; } @@ -110,7 +230,7 @@ public class GenerateLineBreakTest implements UCD_Types { out.println("
" + h + "
" + getLBID(before) + "
" + getTypeID(before) + "" + t + "
"); } - public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { + public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { String t = "_"; boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended); String spaceRule = rule; @@ -137,75 +257,83 @@ public class GenerateLineBreakTest implements UCD_Types { return t; } + static final String BREAK = "\u00F7"; + static final String NOBREAK = "\u00D7"; - public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) { - String s = before + filler + after; - int offset = before.length() + filler.length(); + public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) { + int cp; + StringBuffer string = new StringBuffer(); + StringBuffer comment = new StringBuffer("\t# "); + String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK; + string.append(status); + comment.append(' ').append(status).append(" [").append(rule).append(']'); - boolean lb = isBreak(s, offset, false); - - String tlb = (lb ? "b" : "n"); - String comment = ""; - if (comments) comment = - " # " + getLBID(before + filler) - + " " + tlb - + " " + getLBID(after) - + " # " + Default.ucd.getName(before + filler) - + " " + tlb - + " " + Default.ucd.getName(after); + for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) { - out.println(Utility.hex(before + filler) - + "; " + tlb - + "; " + Utility.hex(after) - + comment); + cp = UTF16.charAt(source, offset); + if (string.length() > 0) { + string.append(' '); + comment.append(' '); + } + + string.append(Utility.hex(cp)); + comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")"); + + status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK; + string.append(' ').append(status); + comment.append(' ').append(status).append(" [").append(rule).append(']'); + } + + if (comments) string.append(comment); + out.println(string); } - - public static void findSamples() { + + public void findSamples() { for (int i = 1; i <= 0x10FFFF; ++i) { if (!Default.ucd.isAllocated(i)) continue; - if (Default.ucd.isLeadingJamo(i) - || Default.ucd.isVowelJamo(i) - || Default.ucd.isTrailingJamo(i)) continue; - byte lb = Default.ucd.getLineBreak(i); + if (0xD800 <= i && i <= 0xDFFF) continue; + if(i == 0x1100) { + System.out.print("here"); + } + byte lb = getType(i); if (samples[lb] == null) { samples[lb] = UTF16.valueOf(i); } } - // fill the last with special cases - samples[LB_LIMIT] = "\u1100"; - samples[LB_LIMIT+1] = "\u1162"; - samples[LB_LIMIT+2] = "\u11A8"; + for (int i = 0; i < TypeOrder.length; ++i) { + String sample = samples[i]; + System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample)); + } } - public static String getLBID(String s) { - if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0)); + public String getTypeID(String s) { + if (s == null) return ""; + if (s.length() == 1) return getTypeID(s.charAt(0)); StringBuffer result = new StringBuffer(); int cp; for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { cp = UTF32.char32At(s, i); if (i > 0) result.append(" "); - result.append(Default.ucd.getLineBreakID(cp)); + result.append(getTypeID(cp)); } return result.toString(); } - static String rule; - - public static int findLastNon(String source, int offset, byte notLBType) { + public int findLastNon(String source, int offset, byte notLBType, boolean recommended) { int cp; - for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) { + for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); - byte f = getResolvedLB(cp); - if (f != notLBType) return cp; + byte f = getResolvedType(cp, recommended); + if (f != notLBType) return i; } - return 0; + return -1; } - public static byte getResolvedLB (int cp) { + public byte getResolvedType (int cp, boolean recommended) { // LB 1 Assign a line break category to each character of the input. // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. - byte result = Default.ucd.getLineBreak(cp); + byte result = getType(cp); switch (result) { case LB_AI: result = LB_AI; break; // case LB_CB: result = LB_ID; break; @@ -213,17 +341,31 @@ public class GenerateLineBreakTest implements UCD_Types { // case LB_SG: result = LB_XX; break; Surrogates; will never occur case LB_XX: result = LB_AL; break; } + if (recommended) { + if (getHangulType(cp) != hNot) { + result = LB_ID; + } + } + return result; } - + + public boolean onCodepointBoundary(String s, int offset) { + if (offset < 0 || offset > s.length()) return false; + if (offset == 0 || offset == s.length()) return true; + if (UTF16.isLeadSurrogate(s.charAt(offset-1)) + && UTF16.isTrailSurrogate(s.charAt(offset))) return false; + return true; + } + // find out whether there is a break at offset // WARNING: as a side effect, sets "rule" - public static boolean isBreak(String source, int offset, boolean recommended) { + public boolean isBreak(String source, int offset, boolean recommended) { // LB 1 Assign a line break category to each character of the input. // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. - // this is taken care of in the getResolvedLB function + // this is taken care of in the getResolvedType function // LB 2a Never break at the start of text @@ -237,8 +379,7 @@ public class GenerateLineBreakTest implements UCD_Types { // UTF-16: never break in the middle of a code point - if (UTF16.isLeadSurrogate(source.charAt(offset-1)) - && UTF16.isTrailSurrogate(source.charAt(offset))) return false; + if (!onCodepointBoundary(source, offset)) return false; // now get the character before and after, and their types @@ -247,8 +388,8 @@ public class GenerateLineBreakTest implements UCD_Types { int cpBefore = UTF16.charAt(source, offset-1); int cpAfter = UTF16.charAt(source, offset); - byte before = getResolvedLB(cpBefore); - byte after = getResolvedLB(cpAfter); + byte before = getResolvedType(cpBefore, recommended); + byte after = getResolvedType(cpAfter, recommended); rule="3a"; @@ -276,22 +417,21 @@ public class GenerateLineBreakTest implements UCD_Types { // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. rule="6"; if (after == LB_CM) return false; - if (Default.ucd.isLeadingJamo(cpBefore)) { - if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false; - } else if (Default.ucd.isVowelJamo(cpBefore)) { - if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false; - } else if (Default.ucd.isTrailingJamo(cpBefore)) { - if (Default.ucd.isTrailingJamo(cpAfter)) return false; - } - + + if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; + + if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; + + if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; + boolean setBase = false; if (before == LB_CM) { setBase = true; - int cp = findLastNon(source, offset, LB_CM); - if (cp == 0) { + int backOffset = findLastNon(source, offset, LB_CM, recommended); + if (backOffset < 0) { before = LB_ID; } else { - before = getResolvedLB(cp); + before = getResolvedType(UTF16.charAt(source, backOffset), recommended); } } @@ -310,9 +450,9 @@ public class GenerateLineBreakTest implements UCD_Types { // find the last non-space character; we will need it byte lastNonSpace = before; if (lastNonSpace == LB_SP) { - int cp = findLastNon(source, offset, LB_CM); - if (cp != 0) { - lastNonSpace = getResolvedLB(cp); + int backOffset = findLastNon(source, offset, LB_CM, recommended); + if (backOffset >= 0) { + lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended); } } @@ -476,4 +616,162 @@ public class GenerateLineBreakTest implements UCD_Types { rule="20"; return true; } + + static class GenerateWordBreakTest extends GenerateLineBreakTest { + + static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8, + oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES! + L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT, + LIMIT = LVT + 1; + + static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" }; + + static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend); + static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); + static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); + + { + fileName = "Word"; + TypeOrder = new byte[LIMIT]; + for (byte i = 0; i < TypeOrder.length; ++i) { + TypeOrder[i] = i; + } + } + + boolean skipType(byte type) { + return false; + } + + public int getLimit() { + return LIMIT; + } + + public int getTableLimit() { + return LIMIT; + } + + // stuff that subclasses need to override + public int genTestItems(String before, String after, String[] results) { + results[0] = before + after; + return 1; + } + + public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { + boolean normalBreak = isBreak(before + after, before.length(), recommended); + String normalRule = rule; + ruleOut[0] = rule; + return normalBreak ? BREAK : NOBREAK; + } + + // stuff that subclasses need to override + public String getTypeID(int cp) { + byte type = getType(cp); + if (type >= oLIMIT) return hNames[type - oLIMIT]; + return Names[type]; + } + + // stuff that subclasses need to override + public byte getType(int cp) { + // single characters + if (cp == 0xA) return LF; + if (cp == 0xD) return CR; + if (cp == 0x034F) return CGJ; + if (cp == 0x2028 || cp == 0x2029) return Control; + + // Hangul + byte result = getHangulType(cp); + if (result != hNot) return (byte)(result + oLIMIT); + + // other properties + // category based + byte cat = Default.ucd.getCategory(cp); + if (cat == Cc) return Control; + if (cat == Cf) return Extend; + if (((1< source.length()) return false; + if (offset == 0) return true; + + rule = "2"; + if (offset == source.length()) return true; + + // UTF-16: never break in the middle of a code point + if (!onCodepointBoundary(source, offset)) return false; + + // now get the character before and after, and their types + + + int cpBefore = UTF16.charAt(source, offset-1); + int cpAfter = UTF16.charAt(source, offset); + + byte before = getResolvedType(cpBefore, recommended); + byte after = getResolvedType(cpAfter, recommended); + + rule = "3"; + if (before == CR && after == LF) return false; + + rule = "4"; + if (before == CR || before == LF || before == Control + || after == Control || after == LF || after == CR) return true; + + rule = "6"; + if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; + + rule = "7"; + if ((before == LV || before == V) && (after == V || after == T)) return false; + + rule = "8"; + if ((before == LVT || before == T) && (after == T)) return false; + + rule = "9"; + if (after == Extend) return false; + + if (recommended) { + if (after == Link || after == CGJ) return false; + } else { + + // Do not break around a CGJ. + rule = "10"; + if (before == CGJ && (after == Base + || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false; + rule = "11"; + if (after == CGJ) return false; + + // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together. + + rule = "12"; + //Link Extend* × LetterBase (12) + if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) { + int backOffset = findLastNon(source, offset, Extend, recommended); + if (backOffset >= 0) { + byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended); + if (last == Link) return false; + } + } + + rule = "13"; + if (after == Link) return false; + } + + // Otherwise break after all characters. + rule = "14"; + return true; + + } + + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java index 250266f9923..56f2e0fdc4d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.2 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -23,20 +23,23 @@ public class GenerateThaiBreaks { BufferedReader br = new BufferedReader( new InputStreamReader( - new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle")); + new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle")); PrintWriter out = null; try { Default.setUCD(); - UnicodeSet ignorables = new UnicodeSet(0xE30, 0xE3A); + UnicodeSet ignorables = new UnicodeSet(); + /* new UnicodeSet(0xE30, 0xE3A); ignorables.add(0x0E40, 0x0E44); // add logical order exception ignorables.add(0x0E47, 0x0E4E); + */ ignorables.add(0, ' '); // add controls ignorables.add('.'); - Set initials = new TreeSet(); - Set finals = new TreeSet(); - Set medials = new TreeSet(); + + UnicodeSet initials = new UnicodeSet(); + UnicodeSet finals = new UnicodeSet(); + UnicodeSet medials = new UnicodeSet(); char[] buffer = new char[100]; @@ -60,34 +63,58 @@ public class GenerateThaiBreaks { } initials.add(temp.substring(0,1)); - initials.add(temp.substring(0,2)); - finals.add(temp.substring(temp.length()-2)); + //initials.add(temp.substring(0,2)); finals.add(temp.substring(temp.length()-1)); + //finals.add(temp.substring(temp.length()-1)); - for (int i = 1; i < temp.length() - 3; ++i) { - medials.add(temp.substring(i, i+2)); + for (int i = 1; i < temp.length() - 1; ++i) { + //medials.add(temp.substring(i, i+2)); medials.add(temp.substring(i, i+1)); } - medials.add(temp.substring(temp.length() - 2, temp.length() - 1)); + //medials.add(temp.substring(temp.length() - 2, temp.length() - 1)); } System.out.println("initials size: " + initials.size()); System.out.println("finals size: " + finals.size()); System.out.println("medials size: " + medials.size()); + //out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS); + // out.write('\uFEFF'); + + UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]"); + finals.addAll(marks); + + UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals); + + UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all); + + System.out.println("Never occur: " + missingThai.toPattern(true)); + Utility.showSetNames("", missingThai, true, Default.ucd); + System.out.println(); + + UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials); + UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals); + + System.out.println("Never initial: " + neverInitial.toPattern(true)); + Utility.showSetNames("", neverInitial, true, Default.ucd); + System.out.println(); + + System.out.println("Never final: " + neverFinal.toPattern(true)); + Utility.showSetNames("", neverFinal, true, Default.ucd); + System.out.println(); + initials.removeAll(medials); finals.removeAll(medials); System.out.println("initials size: " + initials.size()); System.out.println("finals size: " + finals.size()); - out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS); - out.write('\uFEFF'); - out.println("Only Initials"); - Utility.print(out, initials, ", ", new MyBreaker()); - out.println(); - out.println("Only Finals"); - Utility.print(out, finals, ", ", new MyBreaker()); + System.out.println("Only Initials" + initials.toPattern(true)); + Utility.showSetNames("", initials, true, Default.ucd); + System.out.println(); + + System.out.println("Only Finals" + finals.toPattern(true)); + Utility.showSetNames("", finals, true, Default.ucd); } finally { br.close(); if (out != null) out.close(); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 40dfef6f186..68b630d8457 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.19 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.20 $ * ******************************************************************************* */ @@ -78,7 +78,7 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null); - else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null); + else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null); else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit(); else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity(); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 2e6adf2cdbd..324ea0a7f43 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2002/07/30 09:56:40 $ -* $Revision: 1.16 $ +* $Date: 2002/08/04 21:38:45 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -737,6 +737,10 @@ public final class UCD implements UCD_Types { return UCD_Names.NT[prop]; } + public static String getNumericTypeID_fromIndex(byte prop, byte style) { + return style == SHORT ? UCD_Names.SHORT_NT[prop] : UCD_Names.NT[prop]; + } + public String getEastAsianWidthID(int codePoint) { return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint)); } @@ -745,6 +749,10 @@ public final class UCD implements UCD_Types { return UCD_Names.EA[prop]; } + public static String getEastAsianWidthID_fromIndex(byte prop, byte style) { + return style != LONG ? UCD_Names.SHORT_EA[prop] : UCD_Names.EA[prop]; + } + public String getLineBreakID(int codePoint) { return getLineBreakID_fromIndex(getLineBreak(codePoint)); } @@ -753,6 +761,10 @@ public final class UCD implements UCD_Types { return UCD_Names.LB[prop]; } + public static String getLineBreakID_fromIndex(byte prop, byte style) { + return style != LONG ? UCD_Names.LB[prop] : UCD_Names.LONG_LB[prop]; + } + public String getJoiningTypeID(int codePoint) { return getJoiningTypeID_fromIndex(getJoiningType(codePoint)); } @@ -761,6 +773,10 @@ public final class UCD implements UCD_Types { return UCD_Names.JOINING_TYPE[prop]; } + public static String getJoiningTypeID_fromIndex(byte prop, byte style) { + return style != LONG ? UCD_Names.JOINING_TYPE[prop] : UCD_Names.LONG_JOINING_TYPE[prop]; + } + public String getJoiningGroupID(int codePoint) { return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint)); } @@ -769,6 +785,11 @@ public final class UCD implements UCD_Types { return UCD_Names.JOINING_GROUP[prop]; } + public static String getJoiningGroupID_fromIndex(byte prop, byte style) { + // no short version + return UCD_Names.JOINING_GROUP[prop]; + } + public String getScriptID(int codePoint) { return getScriptID_fromIndex(getScript(codePoint)); } @@ -790,6 +811,11 @@ public final class UCD implements UCD_Types { return UCD_Names.AGE[prop]; } + public static String getAgeID_fromIndex(byte prop, byte style) { + // no short for + return UCD_Names.AGE[prop]; + } + public String getBinaryPropertiesID(int codePoint, byte bit) { return (getBinaryProperties(codePoint) & (1<>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style); case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style); case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style); - case NUMERIC_TYPE>>8: if (propValue >= LIMIT_NUMERIC_TYPE) break; - if (style != SHORT) return ucd.getNumericTypeID_fromIndex((byte)propValue); - return UCD_Names.SHORT_NT[propValue]; - case EAST_ASIAN_WIDTH>>8: if (propValue >= LIMIT_EAST_ASIAN_WIDTH) break; - if (style != LONG) return ucd.getEastAsianWidthID_fromIndex((byte)propValue); - return UCD_Names.SHORT_EA[propValue]; - case LINE_BREAK>>8: if (propValue >= LIMIT_LINE_BREAK) break; - if (style != LONG) return ucd.getLineBreakID_fromIndex((byte)propValue); - return UCD_Names.LONG_LB[propValue]; - case JOINING_TYPE>>8: if (propValue >= LIMIT_JOINING_TYPE) break; - if (style != LONG) return ucd.getJoiningTypeID_fromIndex((byte)propValue); - return UCD_Names.LONG_JOINING_TYPE[propValue]; - case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break; - return ucd.getJoiningGroupID_fromIndex((byte)propValue); + case NUMERIC_TYPE>>8: ucd.getNumericTypeID_fromIndex((byte)propValue, style); + case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue); + case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style); + case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue); + case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue); case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style); - case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break; - if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue); - return UCD_Names.ABB_SCRIPT[propValue]; - case AGE>>8: if (propValue >= LIMIT_AGE) break; - return ucd.getAgeID_fromIndex((byte)propValue); + case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue); + case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue); /* case DERIVED>>8: UnicodeProperty up = DerivedProperty.make(propValue, ucd); diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 58c89c09797..67fc9fcb655 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.23 $ +* $Date: 2002/08/04 21:38:44 $ +* $Revision: 1.24 $ * ******************************************************************************* */ @@ -17,9 +17,10 @@ import java.util.*; import java.text.*; import java.io.*; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UTF16; import com.ibm.text.UCD.*; -public final class Utility { // COMMON UTILITIES +public final class Utility implements UCD_Types { // COMMON UTILITIES static final boolean UTF8 = true; // TODO -- make argument @@ -470,7 +471,22 @@ public final class Utility { // COMMON UTILITIES return quoteXML(source, false); } - + private static UnicodeProperty defaultIgnorable = null; + + public static String getDisplay(int cp) { + String result = UTF16.valueOf(cp); + byte cat = Default.ucd.getCategory(cp); + if (cat == Mn || cat == Me) { + result = String.valueOf(DOTTED_CIRCLE) + result; + } else if (cat == Cf || cat == Cc || cp == 0x034F || cp == 0x00AD || cp == 0x1806) { + result = "\u25A1"; + } else { + if (defaultIgnorable == null) defaultIgnorable = DerivedProperty.make(DefaultIgnorable); + if (defaultIgnorable.hasValue(cp)) result = "\u25A1"; + } + return result; + } + public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) { while (aStart < aEnd && bStart < bEnd) { int diff = a[aStart++] - b[bStart++];