From 5a41a000bdfe6ad59660cec3a74df2bdc6195ea8 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Thu, 8 Aug 2002 15:38:16 +0000 Subject: [PATCH] linebreak testing X-SVN-Rev: 9616 --- .../com/ibm/text/UCD/GenerateBreakTest.java | 1446 +++++++++++++++++ .../com/ibm/text/UCD/IANANames.java | 177 ++ .../com/ibm/text/UCD/UnifiedProperty.java | 187 +++ 3 files changed, 1810 insertions(+) create mode 100644 tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java create mode 100644 tools/unicodetools/com/ibm/text/UCD/IANANames.java create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java new file mode 100644 index 00000000000..a7f1b65ed1b --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -0,0 +1,1446 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ +* $Date: 2002/08/08 15:38:15 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import java.util.*; +import java.io.*; + +import com.ibm.text.utility.*; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; + +abstract public class GenerateBreakTest implements UCD_Types { + + static boolean DEBUG = false; + + // COMMON STUFF for Hangul + static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5; + static final String[] hNames = {"L", "V", "T", "LV", "LVT"}; + + static byte getHangulType(int cp) { + if (Default.ucd.isLeadingJamo(cp)) return hL; + if (Default.ucd.isVowelJamo(cp)) return hV; + if (Default.ucd.isTrailingJamo(cp)) return hT; + if (Default.ucd.isHangulSyllable(cp)) { + if (Default.ucd.isDoubleHangul(cp)) return hLV; + return hLVT; + } + return hNot; + } + + public static boolean onCodepointBoundary(String s, int offset) { + if (offset < 0 || offset > s.length()) return false; + if (offset == 0 || offset == s.length()) return true; + if (UTF16.isLeadSurrogate(s.charAt(offset-1)) + && UTF16.isTrailSurrogate(s.charAt(offset))) return false; + return true; + } + + // finds the first base character, or the first character if there is no base + public static int findFirstBase(String source, int start, int limit) { + int cp; + for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + byte cat = Default.ucd.getCategory(cp); + if (((1< " + showData(decomp, INFOPROPS, "\r\n\t")); + shown = true; + } + System.out.println(j + ": " + tests[k].fileName); + } + } + } + } + } + + static String showData(String source, UnicodeProperty[] props, String separator) { + StringBuffer result = new StringBuffer(); + int cp; + for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + if (i != 0) result.append(separator); + result.append(Default.ucd.getCodeAndName(cp)); + for (int j = 0; j < props.length; ++j) { + result.append(", "); + result.append(props[j].getProperty(SHORT)).append('=').append(props[j].getValue(cp,SHORT)); + } + } + return result.toString(); + } + + // determines if string is of form Base NSM* + static boolean isBaseNSMStar(String source) { + int cp; + int status = 0; + for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + byte cat = Default.ucd.getCategory(cp); + int catMask = 1<" + + fileName + ""); + out.println("

Current (fixed only for consistency):

"); + + + + generateTable(out, false); + out.println("

Recommended:

"); + generateTable(out, true); + out.println(""); + out.close(); + + String[] testCase = new String[50]; + // do main test + + for (int k = 0; k < 2; ++k) { + out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS); + int counter = 0; + + out.println("# Default " + fileName + " Break Test"); + out.println("# Generated: " + Default.getDate() + ", MED"); + out.println("#"); + out.println("# Format:"); + out.println("# (# )? "); + out.println("# contains hex Unicode code points, with "); + out.println("#\t" + BREAK + " wherever there is a break opportunity, and "); + out.println("#\t" + NOBREAK + " wherever there is not."); + out.println("# the format can change, but currently it shows:"); + out.println("#\t- the sample character name"); + out.println("#\t- (x) the line_break property* for the sample character"); + out.println("#\t- [x] the rule that determines whether there is a break or not"); + out.println("#"); + sampleDescription(out); + out.println("# These samples may be extended or changed in the future."); + out.println("#"); + + for (int ii = 0; ii < sampleLimit; ++ii) { + String before = samples[ii]; + + for (int jj = 0; jj < sampleLimit; ++jj) { + Utility.dot(counter); + String after = samples[jj]; + + // do line straight + int len = genTestItems(before, after, testCase); + for (int q = 0; q < len; ++q) { + printLine(out, testCase[q], k != 0 && q == 0, false, false); + ++counter; + } + } + } + + for (int ii = 0; ii < extraSingleSamples.length; ++ii) { + printLine(out, extraSingleSamples[ii], true, false, false); + } + out.println("# Lines: " + counter); + out.close(); + } + } + + public void sampleDescription(PrintWriter out) {} + + abstract public boolean isBreak(String source, int offset, boolean recommended); + + abstract public byte getType (int cp, boolean recommended); + + abstract public String getTypeID(int s, boolean recommended); + + + final public byte getType (int cp) { + return getType(cp, false); + } + + final public String getTypeID(int cp) { + return getTypeID(cp, false); + } + + final public String getTypeID(String s) { + return getTypeID(s, false); + } + + public String getTypeID(String s, boolean recommended) { + if (s == null) return ""; + if (s.length() == 1) return getTypeID(s.charAt(0), recommended); + StringBuffer result = new StringBuffer(); + int cp; + for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { + cp = UTF32.char32At(s, i); + if (i > 0) result.append(" "); + result.append(getTypeID(cp, recommended)); + } + return result.toString(); + } + + static final int DONE = -1; + + public int next(String source, int offset, boolean recommended) { + for (int i = offset + 1; i <= source.length(); ++i) { + if (isBreak(source, i, recommended)) return i; + } + return DONE; + } + + public int previous(String source, int offset, boolean recommended) { + for (int i = offset - 1; i >= 0; --i) { + if (isBreak(source, i, recommended)) return i; + } + return DONE; + } + + public int genTestItems(String before, String after, String[] results) { + results[0] = before + after; + return 1; + } + + public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { + boolean normalBreak = isBreak(before + after, before.length(), recommended); + String normalRule = rule; + ruleOut[0] = rule; + return normalBreak ? BREAK : NOBREAK; + } + + public byte getResolvedType(int cp, boolean recommended) { + return getType(cp, recommended); + } + + boolean skipType(int type) { + return false; + } + + static String getInfo(String s) { + if (s == null || s.length() == 0) return "NULL"; + StringBuffer result = new StringBuffer(); + int cp; + for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { + cp = UTF32.char32At(s, i); + if (i > 0) result.append(", "); + result.append(Default.ucd.getCodeAndName(cp)); + result.append(", gc=" + Default.ucd.getCategoryID_fromIndex(Default.ucd.getCategory(cp),SHORT)); + result.append(", sc=" + Default.ucd.getScriptID_fromIndex(Default.ucd.getScript(cp),SHORT)); + result.append(", lb=" + Default.ucd.getLineBreakID_fromIndex(Default.ucd.getLineBreak(cp))); + } + return result.toString(); + } + + public void generateTable(PrintWriter out, boolean recommended) { + String width = "width='" + (100 / (tableLimit + 1)) + "%'"; + out.print(""); + String types = ""; + String codes = ""; + for (int type = 0; type < tableLimit; ++type) { + String after = samples[type]; + if (after == null) continue; + + String h = getTypeID(after, recommended); + types += ""; + codes += ""; + } + + out.println("" + types + ""); + out.println("" + codes + ""); + + String[] rule = new String[1]; + String[] rule2 = new String[1]; + for (int type = 0; type < tableLimit; ++type) { + String before = samples[type]; + if (before == null) continue; + + String line = "" + + ""; + + for (int type2 = 0; type2 < tableLimit; ++type2) { + String after = samples[type2]; + if (after == null) continue; + + String t = getTableEntry(before, after, recommended, rule); + String background = ""; + String t2 = getTableEntry(before, after, !recommended, rule2); + if (!t.equals(t2)) { + if (t.equals(NOBREAK)) { + background = " bgcolor='#CCFFFF'"; + } else { + background = " bgcolor='#FFFF00'"; + } + } else if (t.equals(NOBREAK)) { + background = " bgcolor='#CCCCFF'"; + } + line += ""; + } + out.println(line + ""); + } + out.println("
" + h + "" + Utility.hex(after) + "
" + getTypeID(before, recommended) + "" + Utility.hex(before) + "" + t + "
"); + out.println("
    "); + for (int ii = 0; ii < extraSingleSamples.length; ++ii) { + out.println("
  1. "); + printLine(out, extraSingleSamples[ii], true, false, true); + out.println("
  2. "); + } + out.println("
"); + } + + static final String BREAK = "\u00F7"; + static final String NOBREAK = "\u00D7"; + + public void printLine(PrintWriter out, String source, boolean comments, boolean recommended, boolean html) { + int cp; + StringBuffer string = new StringBuffer(); + StringBuffer comment = new StringBuffer("\t# "); + boolean hasBreak = isBreak(source, 0, recommended); + String status; + if (html) { + status = hasBreak ? " style='border-right: 1px solid blue'" : ""; + string.append("  "); + } else { + status = hasBreak ? BREAK : NOBREAK; + string.append(status); + } + comment.append(' ').append(status).append(" [").append(rule).append(']'); + + for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) { + + cp = UTF16.charAt(source, offset); + hasBreak = isBreak(source, offset + UTF16.getCharCount(cp), recommended); + + if (html) { + status = hasBreak ? " style='border-right: 1px solid blue'" : ""; + string.append("" + + Utility.quoteXML(Utility.getDisplay(cp), true) + + ""); + string.append("  "); + } else { + if (string.length() > 0) { + string.append(' '); + comment.append(' '); + } + + status = hasBreak ? BREAK : NOBREAK; + + string.append(Utility.hex(cp)); + comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")"); + string.append(' ').append(status); + comment.append(' ').append(status).append(" [").append(rule).append(']'); + } + } + + if (comments && !html) string.append(comment); + out.println(string); + } + + public void findSamples() { + + // what we want is a list of sample characters. In the simple case, this is just one per type. + // However, if there are characters that have different types (when recommended or not), then + // we want a type for each cross-section + + BitSet bitset = new BitSet(); + Map list = new TreeMap(); + + for (int i = 1; i <= 0x10FFFF; ++i) { + if (!Default.ucd.isAllocated(i)) continue; + if (0xD800 <= i && i <= 0xDFFF) continue; + if (DEBUG && i == 0x1100) { + System.out.println("debug"); + } + byte lb = getType(i); + byte lb2 = getType(i, true); + if (lb == lb2 && skipType(lb)) continue; + + int combined = (lb << 7) + lb2; + if (!bitset.get(combined)) { + bitset.set(combined); + list.put(new Integer(combined), UTF16.valueOf(i)); + } + /* + // if the sample slot is full OR + if (samples[lb] == null) { + samples[lb] = UTF16.valueOf(i); + if (sampleLimit <= lb) sampleLimit = lb + 1; + // byte lb2 = getType(i, true); + // if (lb2 != lb) bs.set(lb); + } + */ + } + + Iterator it = list.keySet().iterator(); + while (it.hasNext()) { + String sample = (String)list.get(it.next()); + samples[sampleLimit++] = sample; + if (DEBUG) System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample)); + } + + tableLimit = sampleLimit; + + // now add values that are different + /* + + for (int i = 1; i <= 0x10FFFF; ++i) { + if (!Default.ucd.isAllocated(i)) continue; + if (0xD800 <= i && i <= 0xDFFF) continue; + byte lb = getType(i); + byte lb2 = getType(i, true); + if (lb == lb2) continue; + // pick some different ones + if (!bs.get(lb)) { + samples[sampleLimit++] = UTF16.valueOf(i); + bs.set(lb); + } + if (!bs2.get(lb2)) { + samples[sampleLimit++] = UTF16.valueOf(i); + bs.set(lb2); + } + } + */ + + System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length); + sampleLimit += extraSamples.length; + } + + public int findLastNon(String source, int offset, byte notLBType, boolean recommended) { + int cp; + for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + byte f = getResolvedType(cp, recommended); + if (f != notLBType) return i; + } + return -1; + } + + + // ======================================== + + static class GenerateLineBreakTest extends GenerateBreakTest { + // all the other items are supplied in UCD_TYPES + static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, + LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, + LB2_LIMIT = (byte)(LB_SUP + 1); + + /* + private byte[] AsmusOrderToMyOrder = { + LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, + LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, + // missing from Pair Table + LB_SP, LB_BK, LB_CR, LB_LF, + // resolved types below + LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, + // 3 JAMO CLASSES, plus supplementary + LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP + }; + + private byte[] MyOrderToAsmusOrder = new byte[AsmusOrderToMyOrder.length]; + { + for (byte i = 0; i < AsmusOrderToMyOrder.length; ++i) { + MyOrderToAsmusOrder[AsmusOrderToMyOrder[i]] = i; + } + */ + { + fileName = "Line"; + extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby" }; + } + + + public void sampleDescription(PrintWriter out) { + out.println("# Samples:"); + out.println("# The test currently takes all pairs of linebreak types*,"); + out.println("# picks a sample for each type, and generates three strings: "); + out.println("#\t- the pair alone"); + out.println("#\t- the pair alone with an imbeded space"); + out.println("#\t- the pair alone with embedded combining marks"); + out.println("# The sample for each type is simply the first code point (above NULL)"); + out.println("# with that property."); + out.println("# * Note:"); + out.println("#\t- SG is omitted"); + out.println("#\t- 3 different Jamo characters and a supplementary character are added"); + out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); + out.println("#\t instead of the linebreak property"); + out.println("#"); + } + + // stuff that subclasses need to override + public int genTestItems(String before, String after, String[] results) { + results[0] = before + after; + results[1] = before + " " + after; + results[2] = before + "\u0301\u0308" + after; + return 3; + } + + // stuff that subclasses need to override + boolean skipType(int type) { + return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX; + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + byte result = getType(cp, recommended); + if (result == LB_SUP) return "SUP"; + if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; + return Default.ucd.getLineBreakID_fromIndex(result); // AsmusOrderToMyOrder[result]); + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + if (cp > 0xFFFF) return LB_SUP; + byte result = getHangulType(cp); + if (result != hNot) return (byte)(result + LB_LIMIT); + // return MyOrderToAsmusOrder[Default.ucd.getLineBreak(cp)]; + return Default.ucd.getLineBreak(cp); + } + + public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) { + String t = "_"; + boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended); + String spaceRule = rule; + + boolean spaceBreak2 = isBreak(before + " " + after, before.length(), recommended); + String spaceRule2 = rule; + + boolean normalBreak = isBreak(before + after, before.length(), recommended); + String normalRule = rule; + + if (!normalBreak) { + if (!spaceBreak && !spaceBreak2) { + t = "^"; + rule = spaceRule.equals(normalRule) ? normalRule : spaceRule + "/" + normalRule; + if (!spaceRule2.equals(normalRule) && !spaceRule2.equals(spaceRule)) { + rule += "/" + spaceRule2; + } + } else { + t = "%"; + rule = normalRule; + } + } + ruleOut[0] = rule; + return t; + } + + + public byte getResolvedType (int cp, boolean recommended) { + // LB 1 Assign a line break category to each character of the input. + // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. + byte result = getType(cp); + switch (result) { + case LB_AI: result = LB_AI; break; + // case LB_CB: result = LB_ID; break; + case LB_SA: result = LB_AL; break; + // case LB_SG: result = LB_XX; break; Surrogates; will never occur + case LB_XX: result = LB_AL; break; + } + if (recommended) { + if (getHangulType(cp) != hNot) { + result = LB_ID; + } + } + + return result; + } + + // find out whether there is a break at offset + // WARNING: as a side effect, sets "rule" + + public boolean isBreak(String source, int offset, boolean recommended) { + + // LB 1 Assign a line break category to each character of the input. + // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. + // this is taken care of in the getResolvedType function + + // LB 2a Never break at the start of text + + rule="2a"; + if (offset <= 0) return false; + + // LB 2b Always break at the end of text + + rule="2b"; + if (offset >= source.length()) return true; + + + // UTF-16: never break in the middle of a code point + if (!onCodepointBoundary(source, offset)) return false; + + + // now get the character before and after, and their types + + + int cpBefore = UTF16.charAt(source, offset-1); + int cpAfter = UTF16.charAt(source, offset); + + byte before = getResolvedType(cpBefore, recommended); + byte after = getResolvedType(cpAfter, recommended); + + + rule="3a"; + // Always break after hard line breaks (but never between CR and LF). + // CR ^ LF + if (before == LB_CR && after == LB_LF) return false; + if (before == LB_BK || before == LB_LF || before == LB_CR) return true; + + //LB 3b Don’t break before hard line breaks. + rule="3b"; + if (after == LB_BK || after == LB_LF | after == LB_CR) return false; + + // LB 4 Don’t break before spaces or zero-width space. + // × SP + // × ZW + + rule="4"; + if (after == LB_SP || after == LB_ZW) return false; + + // LB 5 Break after zero-width space. + // ZW ÷ + rule="5"; + if (before == LB_ZW) return true; + + // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. + rule="6"; + if (after == LB_CM) return false; + + if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; + + if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; + + if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; + + boolean setBase = false; + if (before == LB_CM) { + setBase = true; + int backOffset = findLastNon(source, offset, LB_CM, recommended); + if (backOffset < 0) { + before = LB_ID; + } else { + before = getResolvedType(UTF16.charAt(source, backOffset), recommended); + } + } + + // LB 7 In all of the following rules, if a space is the base character for a combining mark, + // the space is changed to type ID. In other words, break before SP CM* in the same cases as + // one would break before an ID. + rule="7"; + if (setBase && before == LB_SP) before = LB_ID; + + // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. + // × CL, × EX, × IS, × SY + rule="8"; + if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; + + + // find the last non-space character; we will need it + byte lastNonSpace = before; + if (lastNonSpace == LB_SP) { + int backOffset = findLastNon(source, offset, LB_CM, recommended); + if (backOffset >= 0) { + lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended); + } + } + + // LB 9 Don’t break after ‘[’, even after spaces. + // OP SP* × + rule="9"; + if (lastNonSpace == LB_OP) return false; + + // LB 10 Don’t break within ‘”[’, , even with intervening spaces. + // QU SP* × OP + rule="10"; + if (lastNonSpace == LB_QU && after == LB_OP) return false; + + // LB 11 Don’t break within ‘]h’, even with intervening spaces. + // CL SP* × NS + rule="11"; + if (lastNonSpace == LB_CL && after == LB_NS) return false; + + // LB 11a Don’t break within ‘——’, even with intervening spaces. + // B2 × B2 + rule="11a"; + if (lastNonSpace == LB_B2 && after == LB_B2) return false; + + + if (recommended) { + // LB 13 Don’t break before or after NBSP or WORD JOINER + // × GL + // GL × + + rule="11b"; + if (after == LB_GL || before == LB_GL) return false; + } + + // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.] + + rule="12"; + // LB 12 Break after spaces + // SP ÷ + + if (before == LB_SP) return true; + + if (!recommended) { + // LB 13 Don’t break before or after NBSP or WORD JOINER + // × GL + // GL × + + rule="13"; + if (after == LB_GL || before == LB_GL) return false; + } + + rule="14"; + // LB 14 Don’t break before or after ‘”’ + // × QU + // QU × + if (before == LB_QU || after == LB_QU) return false; + + // LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, + // small kana and other non- starters, or after acute accents: + // × BA + // × HY + // × NS + // BB × + + if (recommended) { + // LB 14a Break before and after CB + // CB ÷ + // ÷ CB + if (before == LB_CB || after == LB_CB) return true; + + } + + rule="15"; + if (after == LB_NS) return false; + if (after == LB_HY) return false; + if (after == LB_BA) return false; + if (before == LB_BB) return false; + + if (!recommended) { + // LB 15b Break after hyphen-minus, and before acute accents: + // HY ÷ + // ÷ BB + + rule="15b"; + if (before == LB_HY) return true; + if (after == LB_BB) return true; + } + + // LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: + // AL × IN + // ID × IN + // IN × IN + // NU × IN + // Examples: ’9...’, ‘a...’, ‘H...’ + rule="16"; + if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; + if (before == LB_IN && after == LB_IN) return false; + + // Don't break alphanumerics. + // LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ + // ID × PO + // AL × NU + // NU × AL + // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ? + // Examples: $(12.35) 2,1234 (12)¢ 12.54¢ + // This is approximated with the following rules. (Some cases already handled above, + // like ‘9,’, ‘[9’.) + rule="17"; + if (before == LB_ID && after == LB_PO) return false; + if (before == LB_AL && after == LB_NU) return false; + if (before == LB_NU && after == LB_AL) return false; + + // LB 18 Don’t break between the following pairs of classes. + // CL × PO + // HY × NU + // IS × NU + // NU × NU + // NU × PO + // PR × AL + // PR × HY + // PR × ID + // PR × NU + // PR × OP + // SY × NU + // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ + + rule="18"; + if (before == LB_CL && after == LB_PO) return false; + if (before == LB_HY && after == LB_NU) return false; + if (before == LB_IS && after == LB_NU) return false; + if (before == LB_NU && after == LB_NU) return false; + if (before == LB_NU && after == LB_PO) return false; + + if (before == LB_PR && after == LB_AL) return false; + if (before == LB_PR && after == LB_HY) return false; + if (before == LB_PR && after == LB_ID) return false; + if (before == LB_PR && after == LB_NU) return false; + if (before == LB_PR && after == LB_OP) return false; + + if (before == LB_SY && after == LB_NU) return false; + + if (recommended) { + // LB 15b Break after hyphen-minus, and before acute accents: + // HY ÷ + // ÷ BB + + rule="18b"; + if (before == LB_HY) return true; + if (after == LB_BB) return true; + } + + // LB 19 Don’t break between alphabetics (“at”) + // AL × AL + + rule="19"; + if (before == LB_AL && after == LB_AL) return false; + + // LB 20 Break everywhere else + // ALL ÷ + // ÷ ALL + + rule="20"; + return true; + } + } + + //============================================== + + static class GenerateGraphemeBreakTest extends GenerateBreakTest { + + static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8, + oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES! + L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT, + LIMIT = LVT + 1; + + static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" }; + + static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend); + static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); + static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); + static UnicodeSet otherExtendSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Other_GraphemeExtend).getSet() + .addAll(new UnicodeSet("[\u09BE\u09D7\u0B3E\u0B57\u0BD7\u0BBE" + + "\u0CC2\u0CD5\u0CD6\u0D3E\u0D57\u0DCF\u0DDF\\U0001D165\\U0001D16E\\U0001D16F]")); + + { + fileName = "GraphemeCluster"; + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + byte type = getType(cp, recommended); + if (type >= oLIMIT) return hNames[type - oLIMIT]; + return Names[type]; + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + // single characters + if (cp == 0xA) return LF; + if (cp == 0xD) return CR; + if (recommended) { + if (cp == 0x034F) return CGJ; + } + if (cp == 0x2028 || cp == 0x2029) return Control; + + // Hangul + byte result = getHangulType(cp); + if (result != hNot) return (byte)(result + oLIMIT); + + // other properties + // category based + byte cat = Default.ucd.getCategory(cp); + if (cat == Cc) return Control; + if (recommended) { + if (cat == Cf) return Control; + if (cat == Me || cat == Mn) return Extend; + if (otherExtendSet.contains(cp)) return Extend; + return Base; + } + if (cat == Cf) return Extend; + + if (((1<= source.length() ? -1 : UTF16.charAt(source, a2Offset); + } else { + if (DEBUG) { + System.out.println("stop here"); + } + int a1 = next(source, offset, recommended); + context.cpAfter = findFirstBase(source, offset, a1); + + int b1 = previous(source, offset, recommended); + context.cpBefore = findFirstBase(source, b1, offset); + + int a2 = next(source, a1, recommended); + if (a2 != DONE) context.cpAfter2 = findFirstBase(source, a1, a2); + + int b2 = previous(source, b1, recommended); + if (b2 != DONE) context.cpBefore2 = findFirstBase(source, b2, b1); + } + } + + public boolean isBreak(String source, int offset, boolean recommended) { + rule="1"; + if (offset < 0 || offset > source.length()) return false; + if (offset == 0) return true; + + rule = "2"; + if (offset == source.length()) return true; + + // UTF-16: never break in the middle of a code point + if (!onCodepointBoundary(source, offset)) return false; + + // now get the character before and after, and their types + + + int cpBefore = UTF16.charAt(source, offset-1); + int cpAfter = UTF16.charAt(source, offset); + + byte before = getResolvedType(cpBefore, recommended); + byte after = getResolvedType(cpAfter, recommended); + + rule = "3"; + if (before == CR && after == LF) return false; + + rule = "4"; + if (before == CR || before == LF || before == Control + || after == Control || after == LF || after == CR) return true; + + rule = "6"; + if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; + + rule = "7"; + if ((before == LV || before == V) && (after == V || after == T)) return false; + + rule = "8"; + if ((before == LVT || before == T) && (after == T)) return false; + + rule = "9"; + if (after == Extend) return false; + + if (recommended) { + if (after == Link || after == CGJ) return false; + } else { + + // Do not break around a CGJ. + rule = "10"; + if (before == CGJ && (after == Base + || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false; + rule = "11"; + if (after == CGJ) return false; + + // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together. + + rule = "12"; + //Link Extend* × LetterBase (12) + if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) { + int backOffset = findLastNon(source, offset, Extend, recommended); + if (backOffset >= 0) { + byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended); + if (last == Link) return false; + } + } + + rule = "13"; + if (after == Link) return false; + } + + // Otherwise break after all characters. + rule = "14"; + return true; + + } + + } + + //============================================== + + static class GenerateWordBreakTest extends GenerateBreakTest { + + static final byte Hiragana = 0, Katakana = 1, Letter = 2, MidLetter = 3, Hyphen = 4, + Numeric = 5, Infix_Numeric = 6, Prefix_Numeric = 7, Postfix_Numeric = 8, + Prefix = 9, Postfix = 10, Other = 11, + LIMIT = Other + 1; + + static final String[] Names = {"Hiragana", "Katakana", "Letter", "MidLetter", "Hyphen", + "Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "Other" }; + + GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest(); + GenerateGraphemeBreakTest.Context context = new GenerateGraphemeBreakTest.Context(); + + static String LENGTH = "[\u30FC\uFF70]"; + static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9F]"; + static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; + static String HIRAGANA_ITERATION = "[\u309D\u309E]"; + + static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]"); + + static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID); + static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); + static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); + + static UnicodeSet prefixSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Logical_Order_Exception).getSet(); + static UnicodeSet postfixSet; + static UnicodeSet exceptionLetters; + static UnicodeSet normalLetters; + static UnicodeSet thaiLaoNormal; + + static UnicodeSet marks = UnifiedBinaryProperty.make(CATEGORY | Mn).getSet() + .addAll(UnifiedBinaryProperty.make(CATEGORY | Me).getSet()); + + static UnicodeSet oughtToBeLm = new UnicodeSet("[\u02B9-\u02BA\u02C2-\u02CF\u02D2-\u02DF\u02E5-\u02ED]"); +/* +U+02B9..U+02BA # MODIFIER LETTER PRIME..MODIFIER LETTER DOUBLE PRIME +U+02C2..U+02CF # MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER LOW ACUTE ACCENT +U+02D2..U+02DF # MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCE +U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRATED +*/ + + static UnicodeSet letterSet = UnifiedBinaryProperty.make(CATEGORY | Lo).getSet() + .addAll(UnifiedBinaryProperty.make(CATEGORY | Lu).getSet()) + .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet()) + .addAll(UnifiedBinaryProperty.make(CATEGORY | Ll).getSet()) + .addAll(UnifiedBinaryProperty.make(CATEGORY | Lm).getSet()) + .addAll(UnifiedBinaryProperty.make(CATEGORY | Mc).getSet()) + .addAll(oughtToBeLm); + + { + + UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NS).getSet(); + + postfixSet = new UnicodeSet(linebreakNS) + .retainAll(letterSet) + .addAll(new UnicodeSet("[\u0e30-\u0E3A\u0e45-\u0e4e]")) + .addAll(new UnicodeSet("[[\u0eb0-\u0EBd\u0ec6-\u0ece]-[:Cn:]]")) + .removeAll(new UnicodeSet("[:mn:]")) + .removeAll(new UnicodeSet("[:katakana:]").addAll(extraKatakana)); + + thaiLaoNormal = new UnicodeSet("[[:thai:][:lao:]]").removeAll(prefixSet).removeAll(postfixSet); + + // we want ideographics, hiragana, thai (except prefix/suffix) + UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]"); + + UnicodeSet hiragana = UnifiedBinaryProperty.make(SCRIPT | HIRAGANA_SCRIPT).getSet(); + UnicodeSet smallHiragana = new UnicodeSet(hiragana).retainAll(linebreakNS); + + exceptionLetters = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet() + .addAll(new UnicodeSet("[[:thai:][:lao:]]")) + .addAll(compatIdeographics) + .addAll(hiragana) + .addAll(thaiLaoNormal); + + normalLetters = new UnicodeSet(letterSet).removeAll(exceptionLetters); + + UnicodeSet missingKatakana = new UnicodeSet(extraKatakana).removeAll(new UnicodeSet("[:katakana:]")); + + if (DEBUG) { + System.out.println("compatIdeographics: " + compatIdeographics.toPattern(true)); + Utility.showSetNames("", compatIdeographics, false, Default.ucd); + + System.out.println("missingKatakana: " + missingKatakana.toPattern(true)); + Utility.showSetNames("", missingKatakana, false, Default.ucd); + + + System.out.println("oughtToBeLm: " + oughtToBeLm.toPattern(true)); + Utility.showSetNames("", oughtToBeLm, false, Default.ucd); + + + System.out.println("Prefix: " + prefixSet.toPattern(true)); + Utility.showSetNames("", prefixSet, false, Default.ucd); + + System.out.println("Postfix: " + postfixSet.toPattern(true)); + Utility.showSetNames("", postfixSet, false, Default.ucd); + + System.out.println("exceptionLetters: " + exceptionLetters.toPattern(true)); + + System.out.println("hiragana: " + hiragana.toPattern(true)); + + System.out.println("smallHiragana: " + hiragana.toPattern(true)); + Utility.showSetNames("", smallHiragana, true, Default.ucd); + + System.out.println("midLetterSet: " + midLetterSet.toPattern(true)); + Utility.showSetNames("", midLetterSet, true, Default.ucd); + } + + + fileName = "Word"; + extraSamples = new String[] { + "\uFF70", "\uFF65", "\u30FD" + }; + if (DEBUG) { + System.out.println("length not covered: " + + new UnicodeSet(LENGTH).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); + System.out.println("half-width not covered: " + + new UnicodeSet(HALFWIDTH_KATAKANA).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); + System.out.println("k.iteration not covered: " + + new UnicodeSet(KATAKANA_ITERATION).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); + System.out.println("h.iteration not covered: " + + new UnicodeSet(HIRAGANA_ITERATION).removeAll(new UnicodeSet("[:hiragana:]")).toPattern(true)); + System.out.println("L1: " + getTypeID('\u30FC')); + System.out.println("L2: " + getTypeID('\uFF70')); + } + + extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", "a$-3.14%b", "3a" }; + + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + byte type = getType(cp, recommended); + return Names[type]; + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + byte cat = Default.ucd.getCategory(cp); + byte script = Default.ucd.getScript(cp); + + if (recommended) { + //if (prefixSet.contains(cp)) return Prefix; + //if (postfixSet.contains(cp)) return Postfix; + //if (exceptionLetters.contains(cp)) return XLetter; + } + + boolean isCatLetter = ((1< source.length()) return false; + rule="16"; + if (offset == 0) return true; + + rule="15"; + if (offset == source.length()) return true; + + // Treat a grapheme cluster as if it were a single character: + // the first base character, if there is one; otherwise the first character. + // GC => FB + + rule="1"; + if (!grapheme.isBreak( source, offset, recommended)) return false; + + // now get the base character before and after, and their types + + grapheme.getGraphemeBases(source, offset, recommended, context); + + byte before = getResolvedType(context.cpBefore, recommended); + byte after = getResolvedType(context.cpAfter, recommended); + byte before2 = context.cpBefore2 < 0 ? (byte)-1 : getResolvedType(context.cpBefore2, recommended); + byte after2 = context.cpAfter2 < 0 ? (byte)-1 : getResolvedType(context.cpAfter2, recommended); + + //Don't break between most letters + // Letter × Letter + + rule = "2"; + if (before == Letter && after == Letter) return false; + + // Don’t break letters across certain punctuation + // Letter × MidLetter Letter (3) + // Letter MidLetter × Letter (4) + + /*if (recommended) { + rule = "2a"; + if (before == Prefix && after == Letter) return false; + + rule = "2b"; + if (before == Letter && after == Postfix) return false; + } + */ + + + rule = "3"; + if (before == Letter && after == MidLetter && after2 == Letter) return false; + + rule = "4"; + if (before2 == Letter && before == MidLetter && after == Letter) return false; + + // Don’t break within sequences of digits, or digits adjacent to letters. + + // Numeric × Numeric (5) + rule = "5"; + if (before == Numeric && after == Numeric) return false; + + // Letter × Numeric (6) + rule = "6"; + if (before == Letter && after == Numeric) return false; + + // Numeric × Letter (7) + rule = "7"; + if (before == Numeric && after == Letter) return false; + + + // Don’t break within sequences like: '-3.2' + + // Hyphen × Numeric (8) + rule = "8"; + if (before == Hyphen && after == Numeric) return false; + + // Numeric Infix_Numeric × Numeric (9) + rule = "9"; + if (before2 == Numeric && before == Infix_Numeric && after == Numeric) return false; + + // Numeric × Infix_Numeric Numeric (10) + rule = "10"; + if (before == Numeric && after == Infix_Numeric && after2 == Numeric) return false; + + // Prefix_Numeric × Numeric (11) + rule = "11"; + if (before == Prefix_Numeric && after == Numeric) return false; + + // Numeric × Postfix_Numeric (12) + rule = "12"; + if (before == Numeric && after == Postfix_Numeric) return false; + + // Don't break between Hiragana or Katakana + + if (!recommended) { + // Hiragana × Hiragana (13) + rule = "13"; + if (before == Hiragana && after == Hiragana) return false; + } + + // Katakana × Katakana (14) + rule = "14"; + if (before == Katakana && after == Katakana) return false; + + // Otherwise break always. + rule = "15"; + return true; + + } + + } +} + diff --git a/tools/unicodetools/com/ibm/text/UCD/IANANames.java b/tools/unicodetools/com/ibm/text/UCD/IANANames.java new file mode 100644 index 00000000000..6abfb651bc3 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/IANANames.java @@ -0,0 +1,177 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IANANames.java,v $ +* $Date: 2002/08/08 15:38:16 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import com.ibm.text.utility.*; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.lang.UCharacter; + + +import java.util.*; +import java.text.NumberFormat; +import java.io.*; + +public class IANANames implements UCD_Types { + private Map aliasToBase = new TreeMap(); + private Map aliasToComment = new TreeMap(); + private Map aliasToLine = new TreeMap(); + + public static void testSensitivity() throws IOException { + IANANames iNames = new IANANames(); + Map m = new HashMap(); + Iterator it = iNames.getIterator(); + UnicodeSet removed = new UnicodeSet(); + int maxLength = 0; + while (it.hasNext()) { + String alias = (String) it.next(); + if (maxLength < alias.length()) maxLength = alias.length(); + if (alias.length() > 40) System.out.println("Name >40: " + alias); + if (alias.indexOf(')') >= 0 || alias.indexOf('(') >= 0) System.out.println("Illegal tag: " + alias); + String skeleton = removeNonAlphanumeric(alias, removed); + String other = (String) m.get(skeleton); + if (other != null) { + String base = iNames.getBase(alias); + String otherBase = iNames.getBase(other); + if (!base.equals(otherBase)) { + System.out.println("Collision between: " + alias + " (" + base + ") and " + + other + " (" + otherBase + ")"); + } else { + System.out.println("Alias Variant: " + alias + " and " + other + " (" + base + ")"); + } + } else { + m.put(skeleton, alias); + } + } + System.out.println("Max Length: " + maxLength); + + System.out.println("Characters removed: "); + UnicodeSetIterator usi = new UnicodeSetIterator(removed); + while (usi.next()) { + char c = (char) usi.codepoint; // safe, can't be supplementary + System.out.println("0x" + usi.codepoint + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint)); + } + } + + public IANANames() throws IOException { + BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", false); + try { + boolean atStart = true; + String lastName = ""; + int counter = 0; + while (true) { + String line = in.readLine(); + if (line == null) break; + counter++; + if (atStart) { + if (line.startsWith("-------------")) atStart = false; + continue; + } + if (line.trim().length() == 0) continue; + + if (line.startsWith("Name:") || line.startsWith("Alias:")) { + lastName = add(line, lastName, counter); + } else if (line.startsWith("Source:") || line.startsWith("MIBenum:") + || line.startsWith(" ")) { + continue; + } else if (line.equals("REFERENCES")) { + break; + } else { + System.out.println("Unknown Line: " + line); + } + } + } finally { + in.close(); + } + } + + private String add(String line, String baseName, int counter) { + // extract the alias, doing a little validity check + int pos = line.indexOf(": "); + if (pos < 0) throw new IllegalArgumentException("Bad line: " + counter + " '" + line + "'"); + String alias = line.substring(pos+2).trim(); + + // get comment + String comment = null; + pos = alias.indexOf(' '); + if (pos >= 0) { + comment = alias.substring(pos).trim(); + alias = alias.substring(0, pos); + } + + // reset the baseName if we are a name + if (line.startsWith("Name:")) { + baseName = alias; + } + + // store + if (!alias.equals("None")) { + if (false) { + if (baseName.equals(alias)) System.out.println(); + System.out.println("Adding " + alias + "\t=> " + baseName + (comment != null ? "\t(" + comment + ")" : "")); + } + // check if it is stored already + String oldbaseName = (String) aliasToBase.get(alias); + if (oldbaseName != null) { + System.out.println("Duplicate alias (" + alias + ", " + oldbaseName + ", " + baseName + "): " + + counter + " '" + line + "'"); + } + aliasToBase.put(alias, baseName); + if (comment != null) aliasToComment.put(alias, comment); + aliasToLine.put(alias, comment); + } + return baseName; + } + + public Iterator getIterator() { + return aliasToBase.keySet().iterator(); + } + + /** + * Returns the name for this alias, or "" if there is none + */ + public String getBase(String alias) { + return (String) aliasToBase.get(alias); + } + + public static String removeNonAlphanumeric(String s, UnicodeSet removed) { + s = s.toUpperCase(Locale.ENGLISH); // can't have Turkish! + StringBuffer result = new StringBuffer(); + boolean removedZero = false; + for (int i = 0; i < s.length(); ++i) { + char c = s.charAt(i); + if (c == '0') { + char cLast = result.length() > 0 ? result.charAt(result.length() - 1) : '0'; + if ('0' <= cLast && cLast <= '9') { + result.append(c); + } else { + if (!removed.contains(c)) { + System.out.println("Removed '" + c + "' from " + s + " => " + result); + removed.add(c); + } + removedZero = true; + } + } else if (('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) { + result.append(c); + } else { + if (!removed.contains(c)) { + System.out.println("Removed '" + c + "' from " + s + " => " + result); + removed.add(c); + } + } + } + //if (removedZero) System.out.println("Removed 0 from " + s + " => " + result); + return result.toString(); + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java new file mode 100644 index 00000000000..88e0a2ac53e --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java @@ -0,0 +1,187 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $ +* $Date: 2002/08/08 15:38:16 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; +import java.io.*; +import java.util.*; + +import com.ibm.text.utility.*; +import com.ibm.icu.text.UnicodeSet; + +public final class UnifiedProperty extends UnicodeProperty { + int majorProp; + // DerivedProperty dp; + + public static UnicodeProperty make(int propMask) { + return make(propMask, Default.ucd); + } + + public static UnicodeProperty make(int propMask, UCD ucd) { + if ((propMask & 0xFF00) == (BINARY_PROPERTIES & 0xFF00)) { + return UnifiedBinaryProperty.make(propMask, ucd); + } + if ((propMask & 0xFF00) == DERIVED) { + return DerivedProperty.make(propMask & 0xFF, ucd); + } + if (!isDefined(propMask, ucd)) return null; + return getCached(propMask, ucd); + } + + public static UnicodeProperty make(String propID, UCD ucd) { + return make(getPropmask(propID, ucd), ucd); + } + + public static UnicodeSet getSet(int propMask, UCD ucd) { + UnicodeProperty up = make(propMask, ucd); + return up.getSet(); + } + + public static UnicodeSet getSet(String propID, UCD ucd) { + return getSet(getPropmask(propID, ucd), ucd); + } + + private static Map propNameCache = null; + + public static int getPropmask(String propID, UCD ucd) { + + // cache the names + if (propNameCache == null) { + System.out.println("Caching Property Names"); + propNameCache = new HashMap(); + + for (int i = 0; i < LIMIT_ENUM; ++i) { + UnicodeProperty up = UnifiedProperty.make(i, ucd); + if (up == null) continue; + if (!up.isStandard()) continue; + if (up.getValueType() < BINARY) continue; + String shortName = Utility.getSkeleton(up.getProperty(SHORT)); + String longName = Utility.getSkeleton(up.getProperty(LONG)); + Integer result = new Integer(i); + propNameCache.put(longName, result); + propNameCache.put(shortName, result); + } + System.out.println("Done Caching"); + } + + propID = Utility.getSkeleton(propID); + Integer indexObj = (Integer) propNameCache.get(propID); + if (indexObj == null) { + throw new IllegalArgumentException("No property found for " + propID); + } + return indexObj.intValue(); + } + + static Map cache = new HashMap(); + static UCD lastUCD = null; + static int lastPropMask = -1; + static UnifiedProperty lastValue = null; + static Clump probeClump = new Clump(); + + static class Clump { + int prop; + UCD ucd; + public boolean equals(Object other) { + Clump that = (Clump) other; + return (that.prop != prop || !ucd.equals(that)); + } + } + + private static UnifiedProperty getCached(int propMask, UCD ucd) { + System.out.println(ucd); + if (ucd.equals(lastUCD) && propMask == lastPropMask) return lastValue; + probeClump.prop = propMask; + probeClump.ucd = ucd; + UnifiedProperty dp = (UnifiedProperty) cache.get(probeClump); + if (dp == null) { + dp = new UnifiedProperty(propMask, ucd); + cache.put(probeClump, dp); + probeClump = new Clump(); + } + lastUCD = ucd; + lastValue = dp; + lastPropMask = propMask; + return dp; + } + + ///////////////////////////////// + + private UnifiedProperty(int propMask, UCD ucdin) { + ucd = ucdin; + majorProp = propMask >> 8; + + //System.out.println("A: " + getValueType()); + if (majorProp <= (JOINING_GROUP>>8) || majorProp == SCRIPT>>8) setValueType(FLATTENED_BINARY); + //System.out.println("B: " + getValueType()); + + header = UCD_Names.UNIFIED_PROPERTY_HEADERS[majorProp]; + name = UCD_Names.UNIFIED_PROPERTIES[majorProp]; + shortName = UCD_Names.SHORT_UNIFIED_PROPERTIES[majorProp]; + } + + static private boolean isDefined(int propMask, UCD ucd) { + int majorProp = propMask >> 8; + switch (majorProp) { + case CATEGORY>>8: + case COMBINING_CLASS>>8: + case BIDI_CLASS>>8: + case DECOMPOSITION_TYPE>>8: + case NUMERIC_TYPE>>8: + case EAST_ASIAN_WIDTH>>8: + case LINE_BREAK>>8: + case JOINING_TYPE>>8: + case JOINING_GROUP>>8: + case SCRIPT>>8: + case AGE>>8: + return true; + /* + case DERIVED>>8: + UnicodeProperty up = DerivedProperty.make(propValue, ucd); + if (up == null) break; + return up.hasValue(cp); + */ + } + return false; + } + + public boolean hasValue(int cp) { + throw new ChainException("Can't call 'hasValue' on non-binary property {0}", new Object[]{ + new Integer(majorProp)}); + } + + public String getFullName(byte style) { + String pre = ""; + String preShort = getProperty(SHORT); + String preLong = getProperty(LONG); + if (style < LONG) pre = preShort; + else if (style == LONG || preShort.equals(preLong)) pre = preLong; + else pre = preShort + "(" + preLong + ")"; + return pre; + } + + public String getValue(int cp, byte style) { + switch (majorProp) { + case CATEGORY>>8: return ucd.getCategoryID_fromIndex(ucd.getCategory(cp), style); + case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(cp), style); + case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex(ucd.getBidiClass(cp), style); + case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(cp), style); + case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex(ucd.getNumericType(cp), style); + case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(cp)); + case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp)); + case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(cp)); + case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp)); + case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp)); + case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp)); + default: throw new IllegalArgumentException("Internal Error"); + } + } +} \ No newline at end of file