diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
index 1b0b4e1b59b..728cc9896ba 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2002/06/22 21:02:16 $
-* $Revision: 1.16 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.17 $
*
*******************************************************************************
*/
@@ -110,7 +110,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (ucdData.getDecompositionType(cp) == NONE) return false;
String norm = nfx.normalize(cp);
if (UTF16.countCodePoint(norm) != 1) return true;
@@ -133,7 +133,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
;
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (ucdData.getCombiningClass(cp) != 0) return false;
String norm = nfx.normalize(cp);
int first = UTF16.charAt(norm, 0);
@@ -172,7 +172,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
boolean result = bitset.get(cp);
if (result && filter) {
result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
@@ -243,7 +243,7 @@ public final class DerivedProperty implements UCD_Types {
//if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
} // default
- boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+ public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
class CaseDProp extends UnicodeProperty {
@@ -256,7 +256,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == val
|| val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
@@ -294,7 +294,7 @@ public final class DerivedProperty implements UCD_Types {
return getValue(cp, LONG);
}
- boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+ public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
{
@@ -323,7 +323,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, false);
}
};
@@ -338,7 +338,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
+ "\r\n# NOTE: Cf characters should be filtered out.";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
}
};
@@ -354,7 +354,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, true);
}
};
@@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
}
};
@@ -384,7 +384,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Sm + Other_Math";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Sm
|| ucdData.getBinaryProperty(cp,Math_Property)) return true;
@@ -400,7 +400,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
|| ucdData.getBinaryProperty(cp, Alphabetic)) return true;
@@ -416,7 +416,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Ll + Other_Lowercase";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Ll
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return true;
@@ -432,7 +432,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu + Other_Uppercase";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return true;
@@ -461,7 +461,7 @@ of characters, the first of which has a non-zero combining class.
+ ": Full Composition Exclusion"
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (!ucdData.isRepresented(cp)) return false;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return false;
@@ -488,7 +488,7 @@ of characters, the first of which has a non-zero combining class.
+ ": Full Composition Inclusion"
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (!ucdData.isRepresented(cp)) return false;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return false;
@@ -516,7 +516,7 @@ of characters, the first of which has a non-zero combining class.
if (c.equals(b)) return "";
return "FNC; " + Utility.hex(c);
} // default
- boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+ public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
dprops[FC_NFC_Closure] = new UnicodeProperty() {
@@ -538,7 +538,7 @@ of characters, the first of which has a non-zero combining class.
if (c.equals(b)) return "";
return "FN; " + Utility.hex(c);
} // default
- boolean hasValue(int cp) { return getValue(cp).length() != 0; }
+ public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
@@ -555,7 +555,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
if (ucdData.getBinaryProperty(cp, White_space)) return false;
@@ -573,7 +573,7 @@ of characters, the first of which has a non-zero combining class.
header = header = "# Binary Property";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
switch(cp) {
case 0x27: case 0x2019: case 0xAD: return true;
// case 0x2d: case 0x2010: case 0x2011:
@@ -600,7 +600,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# - has no combining marks with zero canonical combining class"
;
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (hasSoftDot(cp)) return true;
if (Default.nfkd.isNormalized(cp)) return false;
String decomp = Default.nfd.normalize(cp);
@@ -629,7 +629,7 @@ of characters, the first of which has a non-zero combining class.
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
if (dprops[Other_Case_Ignorable].hasValue(cp)) return true;
@@ -654,7 +654,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# (CGJ = U+034F)";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
byte cat = ucdData.getCategory(cp);
@@ -674,7 +674,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
}
- boolean hasValue(int cp) {
+ public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
index 13777bab0b6..54195ea7625 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.8 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.9 $
*
*******************************************************************************
*/
@@ -275,6 +275,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
log.print('\uFEFF');
+ log.println();
+ log.println("@*Override Data");
+ log.println();
+ readOverrides(type);
+
log.println();
log.println("@*DICT Data");
log.println();
@@ -426,7 +431,27 @@ public final class GenerateHanTransliterator implements UCD_Types {
System.out.println("Defined Count: " + count);
log.println();
- log.println("@Duplicates");
+ // Duplicate definitions, listed in the order the words appear in rankList.
+ log.println("@Duplicates (Frequency Order)");
+ log.println();
+ it = rankList.iterator();
+ while (it.hasNext()) {
+     String word = (String) it.next();
+     Collection dups = (Collection) duplicates.get(word);
+     if (dups == null) continue; // nothing recorded in duplicates for this word
+     log.print(hex.transliterate(word) + "\t" + word + "\t");
+     String separator = ""; // becomes ", " once the first entry is written
+     for (Iterator it2 = dups.iterator(); it2.hasNext(); ) {
+         log.print(separator);
+         log.print(it2.next());
+         separator = ", ";
+     }
+     if (overrideSet.contains(word)) log.print(" *override*");
+     log.println();
+ }
+
+ log.println();
+ log.println("@Duplicates (Character Order)");
log.println();
it = duplicates.keySet().iterator();
while (it.hasNext()) {
@@ -440,6 +465,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
else log.print(", ");
log.print(it2.next());
}
+ if (overrideSet.contains(word)) log.print(" *override*");
log.println();
}
@@ -536,13 +562,19 @@ public final class GenerateHanTransliterator implements UCD_Types {
int overallRank = 0;
it = combinedRank.iterator();
- log.println();
- log.println("@Frequency data: Rank of Character");
- log.println();
+ boolean showFrequency = false;
+
+ if (showFrequency) {
+ log.println();
+ log.println("@Frequency data: Rank of Character");
+ log.println();
+ }
+
+ // make up rankMap, rankList
while(it.hasNext()) {
Pair p = (Pair) it.next();
- log.println(p.first + ", " + p.second);
+ if (showFrequency) log.println(p.first + ", " + p.second);
Object rank = rankMap.get(p.second);
if (rank == null) {
rankMap.put(p.second, new Integer(++overallRank));
@@ -550,16 +582,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
- log.println();
- log.println("@Frequency data: Character to Rank");
- log.println();
-
- // get full order
- it = rankList.iterator();
- while (it.hasNext()) {
- Comparable key = (Comparable) it.next();
- Comparable val = (Comparable) rankMap.get(key);
- log.println(key + ", " + val);
+ if (showFrequency) {
+ log.println();
+ log.println("@Frequency data: Character to Rank");
+ log.println();
+
+ // get full order
+ it = rankList.iterator();
+ while (it.hasNext()) {
+ Comparable key = (Comparable) it.next();
+ Comparable val = (Comparable) rankMap.get(key);
+ log.println(key + ", " + val);
+ }
}
} catch (Exception e) {
@@ -712,6 +746,38 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
+ /** CHINESE only: feeds each "code TAB word TAB definition" line of Chinese_override.txt to addCheck and overrideSet. */
+ static void readOverrides(int type) throws IOException {
+     if (type != CHINESE) return;
+     String fname = "Chinese_override.txt";
+     System.out.println("Reading " + fname);
+     BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true);
+     int counter = 0;
+     String line = "";
+     try {
+         while (true) {
+             line = Utility.readDataLine(br);
+             if (line == null) break;
+             if (line.length() == 0) continue;
+             Utility.dot(counter++);
+
+             // skip the leading code field; the word must be followed by a tab and its definition
+             int wordStart = line.indexOf('\t') + 1;
+             int wordEnd = line.indexOf('\t', wordStart);
+             if (wordEnd < 0) throw new IllegalArgumentException("Expected code<TAB>word<TAB>definition");
+             String word = line.substring(wordStart, wordEnd);
+             addCheck(word, line.substring(wordEnd+1), line);
+             overrideSet.add(word);
+         }
+     } catch (Exception e) {
+         throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
+     } finally {
+         br.close(); // release the file even when a line fails to parse
+     }
+ }
+
+ static Set overrideSet = new HashSet();
+
static void processEdict(String word, String definition, String line) {
// We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH
// C = CJK, H = Hiragana, K = katakana
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
index 4dac9600f64..1976bedb10f 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
-* $Date: 2002/07/30 09:57:18 $
-* $Revision: 1.1 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.2 $
*
*******************************************************************************
*/
@@ -21,61 +21,126 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class GenerateLineBreakTest implements UCD_Types {
-
- static String[] samples = new String[LB_LIMIT + 3];
- static byte[] TROrder = {
+ // COMMON STUFF for Hangul
+ static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
+ static final String[] hNames = {"L", "V", "T", "LV", "LVT"};
+
+ /** Hangul type of cp: hL, hV, hT, hLV or hLVT; hNot when none of the Default.ucd Hangul tests match. */
+ static byte getHangulType(int cp) {
+     byte hangulType = hNot;
+     if (Default.ucd.isLeadingJamo(cp)) hangulType = hL;
+     else if (Default.ucd.isVowelJamo(cp)) hangulType = hV;
+     else if (Default.ucd.isTrailingJamo(cp)) hangulType = hT;
+     // precomposed syllable: isDoubleHangul selects LV, anything else is LVT
+     else if (Default.ucd.isHangulSyllable(cp)) hangulType = Default.ucd.isDoubleHangul(cp) ? hLV : hLVT;
+     return hangulType;
+ }
+
+ //============================
+
+ protected String rule;
+ protected String fileName = "Line";
+
+ // all the other items are supplied in UCD_TYPES
+ static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT,
+ LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
+ LB2_LIMIT = (byte)(LB_SUP + 1);
+
+ String[] samples = new String[100];
+
+
+ byte[] TypeOrder = {
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
// missing from Pair Table
LB_SP, LB_BK, LB_CR, LB_LF,
// resolved types below
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
- // 3 JAMO CLASSES
- 29, 30, 31
+ // 3 JAMO CLASSES, plus supplementary
+ LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
};
- static final int TABLE_LIMIT = 25;
-
public static void main(String[] args) throws IOException {
Default.setUCD();
+ new GenerateLineBreakTest().run();
+ new GenerateWordBreakTest().run();
+ }
+
+ // stuff that subclasses need to override
+ public void run() throws IOException {
findSamples();
// test individual cases
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
- PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
- out.println("
Current (fixed only for consistency):
");
+ PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
+ out.println(""
+ + fileName + "");
+ out.println("Current (fixed only for consistency):
");
+
+
+
generateTable(out, false);
- out.println("Recommended:
");
+ out.println("Recommended:
");
generateTable(out, true);
out.println("");
out.close();
+ String[] testCase = new String[50];
// do main test
for (int k = 0; k < 2; ++k) {
- out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
+ out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
int counter = 0;
- out.println("# Default Linebreak conformance test");
- out.println("# " + Default.getDate() + ", MED");
+ out.println("# Default " + fileName + " Break Test");
+ out.println("# Generated: " + Default.getDate() + ", MED");
+ out.println("#");
+ out.println("# Format:");
+ out.println("# (# )? ");
+ out.println("# contains hex Unicode code points, with ");
+ out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
+ out.println("#\t" + NOBREAK + " wherever there is not.");
+ out.println("# the format can change, but currently it shows:");
+ out.println("#\t- the sample character name");
+ out.println("#\t- (x) the line_break property* for the sample character");
+ out.println("#\t- [x] the rule that determines whether there is a break or not");
+ out.println("#");
+ out.println("# Samples:"); // describes the three strings genTestItems builds for each pair
+ out.println("# The test currently takes all pairs of linebreak types*,");
+ out.println("# picks a sample for each type, and generates three strings: ");
+ out.println("#\t- the pair alone");
+ out.println("#\t- the pair alone with an embedded space");
+ out.println("#\t- the pair alone with embedded combining marks");
+ out.println("# The sample for each type is simply the first code point (above NULL)");
+ out.println("# with that property.");
+ out.println("# * Note:");
+ out.println("#\t- SG is omitted");
+ out.println("#\t- 3 different Jamo characters and a supplementary character are added");
+ out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
+ out.println("#\t instead of the linebreak property");
+ out.println("# These samples may be extended in the future.");
out.println("#");
- for (int ii = 0; ii < samples.length; ++ii) {
- int i = TROrder[ii];
+ for (int ii = 0; ii < getLimit(); ++ii) {
+ int i = TypeOrder[ii];
+ if (i == LB_SG) continue;
String before = samples[i];
- for (int jj = 0; jj < samples.length; ++jj) {
- Utility.dot(counter++);
- int j = TROrder[jj];
+ for (int jj = 0; jj < getLimit(); ++jj) {
+ Utility.dot(counter);
+ int j = TypeOrder[jj];
+ if (j == LB_SG) continue;
String after = samples[j];
// do line straight
- printLine(out, before, "", after, k != 0);
- printLine(out, before, " ", after, k != 0);
- printLine(out, before, "\u0301\u0308", after, k != 0);
+ int len = genTestItems(before, after, testCase);
+ for (int q = 0; q < len; ++q) {
+ printLine(out, testCase[q], k != 0 && q == 0, false);
+ ++counter;
+ }
}
}
out.println("# Lines: " + counter);
@@ -83,25 +148,80 @@ public class GenerateLineBreakTest implements UCD_Types {
}
}
- public static void generateTable(PrintWriter out, boolean recommended) {
- out.print(" | ");
- for (int i = 0; i < TABLE_LIMIT; ++i) {
- String h = getLBID(samples[TROrder[i]]);
- out.print("" + h + " | ");
+ // stuff that subclasses need to override
+ /** Fills results[0..2] with before and after joined directly, by a space, and by U+0301 U+0308; returns 3. */
+ public int genTestItems(String before, String after, String[] results) {
+     final String[] fillers = {"", " ", "\u0301\u0308"};
+     for (int k = 0; k < fillers.length; ++k) results[k] = before + fillers[k] + after;
+     return fillers.length;
+ }
+
+ // Types that generateTable omits from the HTML table; subclasses may override.
+ boolean skipType(byte type) {
+ return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX;
+ }
+
+ // stuff that subclasses need to override
+ /** Short label for cp's type: "SUP", a Hangul name from hNames, or the UCD line-break ID. */
+ public String getTypeID(int cp) {
+     final byte type = getType(cp);
+     if (type == LB_SUP) return "SUP";
+     return type < LB_LIMIT ? Default.ucd.getLineBreakID_fromIndex(type) : hNames[type - LB_LIMIT];
+ }
+
+ // stuff that subclasses need to override
+ /** Type index for cp: LB_SUP above U+FFFF, LB_LIMIT + Hangul type for Hangul, else the UCD line-break value. */
+ public byte getType(int cp) {
+     if (cp > 0xFFFF) return LB_SUP;
+     final byte hangul = getHangulType(cp);
+     return hangul == hNot ? Default.ucd.getLineBreak(cp) : (byte)(hangul + LB_LIMIT);
+ }
+
+ public int getLimit() { // bound for the TypeOrder loops in run()
+ return LB2_LIMIT;
+ }
+
+ public int getTableLimit() { // bound for the TypeOrder loops in generateTable
+ return LB_SUP; // skip last (presumably the SUP entry that ends TypeOrder; confirm LB_SUP matches its index)
+ }
+
+
+ public void generateTable(PrintWriter out, boolean recommended) {
+ String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
+ out.print(" | ");
+ byte type;
+ for (int i = 0; i < getTableLimit(); ++i) {
+ type = TypeOrder[i];
+ if (skipType(type)) continue;
+
+ String h = getTypeID(samples[TypeOrder[i]]);
+ out.print("" + h + " | ");
}
out.print("
");
String[] rule = new String[1];
String[] rule2 = new String[1];
- for (int i = 0; i < TABLE_LIMIT; ++i) {
- String before = samples[TROrder[i]];
- String line = "" + getLBID(before) + " | ";
- for (int j = 0; j < TABLE_LIMIT; ++j) {
- String after = samples[TROrder[j]];
+ for (int i = 0; i < getTableLimit(); ++i) {
+ type = TypeOrder[i];
+ if (skipType(type)) continue;
+
+ String before = samples[type];
+ String line = "
---|
" + getTypeID(before) + " | ";
+ for (int j = 0; j < getTableLimit(); ++j) {
+ type = TypeOrder[j];
+ if (skipType(type)) continue;
+
+ String after = samples[type];
String t = getTableEntry(before, after, recommended, rule);
String background = "";
- if (recommended) {
- String t2 = getTableEntry(before, after, false, rule2);
- if (!t.equals(t2)) background = " bgcolor='#FFFF00'";
+ String t2 = getTableEntry(before, after, !recommended, rule2);
+ if (!t.equals(t2)) {
+ if (t.equals(NOBREAK)) {
+ background = " bgcolor='#CCFFFF'";
+ } else {
+ background = " bgcolor='#FFFF00'";
+ }
+ } else if (t.equals(NOBREAK)) {
+ background = " bgcolor='#CCCCFF'";
}
line += "" + t + " | ";
}
@@ -110,7 +230,7 @@ public class GenerateLineBreakTest implements UCD_Types {
out.println("
---|
");
}
- public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
+ public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
String t = "_";
boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
String spaceRule = rule;
@@ -137,75 +257,83 @@ public class GenerateLineBreakTest implements UCD_Types {
return t;
}
+ static final String BREAK = "\u00F7";
+ static final String NOBREAK = "\u00D7";
- public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
- String s = before + filler + after;
- int offset = before.length() + filler.length();
+ public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
+ int cp;
+ StringBuffer string = new StringBuffer();
+ StringBuffer comment = new StringBuffer("\t# ");
+ String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
+ string.append(status);
+ comment.append(' ').append(status).append(" [").append(rule).append(']');
- boolean lb = isBreak(s, offset, false);
-
- String tlb = (lb ? "b" : "n");
- String comment = "";
- if (comments) comment =
- " # " + getLBID(before + filler)
- + " " + tlb
- + " " + getLBID(after)
- + " # " + Default.ucd.getName(before + filler)
- + " " + tlb
- + " " + Default.ucd.getName(after);
+ for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) {
- out.println(Utility.hex(before + filler)
- + "; " + tlb
- + "; " + Utility.hex(after)
- + comment);
+ cp = UTF16.charAt(source, offset);
+ if (string.length() > 0) {
+ string.append(' ');
+ comment.append(' ');
+ }
+
+ string.append(Utility.hex(cp));
+ comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")");
+
+ status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK;
+ string.append(' ').append(status);
+ comment.append(' ').append(status).append(" [").append(rule).append(']');
+ }
+
+ if (comments) string.append(comment);
+ out.println(string);
}
-
- public static void findSamples() {
+
+ public void findSamples() {
for (int i = 1; i <= 0x10FFFF; ++i) {
if (!Default.ucd.isAllocated(i)) continue;
- if (Default.ucd.isLeadingJamo(i)
- || Default.ucd.isVowelJamo(i)
- || Default.ucd.isTrailingJamo(i)) continue;
- byte lb = Default.ucd.getLineBreak(i);
+ if (0xD800 <= i && i <= 0xDFFF) continue;
+ if(i == 0x1100) {
+ System.out.print("here");
+ }
+ byte lb = getType(i);
if (samples[lb] == null) {
samples[lb] = UTF16.valueOf(i);
}
}
- // fill the last with special cases
- samples[LB_LIMIT] = "\u1100";
- samples[LB_LIMIT+1] = "\u1162";
- samples[LB_LIMIT+2] = "\u11A8";
+ for (int i = 0; i < TypeOrder.length; ++i) {
+ String sample = samples[i];
+ System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample));
+ }
}
- public static String getLBID(String s) {
- if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0));
+ public String getTypeID(String s) {
+ if (s == null) return "";
+ if (s.length() == 1) return getTypeID(s.charAt(0));
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(" ");
- result.append(Default.ucd.getLineBreakID(cp));
+ result.append(getTypeID(cp));
}
return result.toString();
}
- static String rule;
-
- public static int findLastNon(String source, int offset, byte notLBType) {
+ public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
int cp;
- for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) {
+ for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
- byte f = getResolvedLB(cp);
- if (f != notLBType) return cp;
+ byte f = getResolvedType(cp, recommended);
+ if (f != notLBType) return i;
}
- return 0;
+ return -1;
}
- public static byte getResolvedLB (int cp) {
+ public byte getResolvedType (int cp, boolean recommended) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
- byte result = Default.ucd.getLineBreak(cp);
+ byte result = getType(cp);
switch (result) {
case LB_AI: result = LB_AI; break;
// case LB_CB: result = LB_ID; break;
@@ -213,17 +341,31 @@ public class GenerateLineBreakTest implements UCD_Types {
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
case LB_XX: result = LB_AL; break;
}
+ if (recommended) {
+ if (getHangulType(cp) != hNot) {
+ result = LB_ID;
+ }
+ }
+
return result;
}
-
+
+ /** True if offset is in [0, s.length()] and does not fall between a lead and a trail surrogate. */
+ public boolean onCodepointBoundary(String s, int offset) {
+     final int end = s.length();
+     if (offset < 0 || offset > end) return false;
+     if (offset == 0 || offset == end) return true;
+     return !(UTF16.isLeadSurrogate(s.charAt(offset-1)) && UTF16.isTrailSurrogate(s.charAt(offset)));
+ }
+
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
- public static boolean isBreak(String source, int offset, boolean recommended) {
+ public boolean isBreak(String source, int offset, boolean recommended) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
- // this is taken care of in the getResolvedLB function
+ // this is taken care of in the getResolvedType function
// LB 2a Never break at the start of text
@@ -237,8 +379,7 @@ public class GenerateLineBreakTest implements UCD_Types {
// UTF-16: never break in the middle of a code point
- if (UTF16.isLeadSurrogate(source.charAt(offset-1))
- && UTF16.isTrailSurrogate(source.charAt(offset))) return false;
+ if (!onCodepointBoundary(source, offset)) return false;
// now get the character before and after, and their types
@@ -247,8 +388,8 @@ public class GenerateLineBreakTest implements UCD_Types {
int cpBefore = UTF16.charAt(source, offset-1);
int cpAfter = UTF16.charAt(source, offset);
- byte before = getResolvedLB(cpBefore);
- byte after = getResolvedLB(cpAfter);
+ byte before = getResolvedType(cpBefore, recommended);
+ byte after = getResolvedType(cpAfter, recommended);
rule="3a";
@@ -276,22 +417,21 @@ public class GenerateLineBreakTest implements UCD_Types {
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
rule="6";
if (after == LB_CM) return false;
- if (Default.ucd.isLeadingJamo(cpBefore)) {
- if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
- } else if (Default.ucd.isVowelJamo(cpBefore)) {
- if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
- } else if (Default.ucd.isTrailingJamo(cpBefore)) {
- if (Default.ucd.isTrailingJamo(cpAfter)) return false;
- }
-
+
+ if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
+
+ if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
+
+ if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
+
boolean setBase = false;
if (before == LB_CM) {
setBase = true;
- int cp = findLastNon(source, offset, LB_CM);
- if (cp == 0) {
+ int backOffset = findLastNon(source, offset, LB_CM, recommended);
+ if (backOffset < 0) {
before = LB_ID;
} else {
- before = getResolvedLB(cp);
+ before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
}
}
@@ -310,9 +450,9 @@ public class GenerateLineBreakTest implements UCD_Types {
// find the last non-space character; we will need it
byte lastNonSpace = before;
if (lastNonSpace == LB_SP) {
- int cp = findLastNon(source, offset, LB_CM);
- if (cp != 0) {
- lastNonSpace = getResolvedLB(cp);
+ int backOffset = findLastNon(source, offset, LB_CM, recommended);
+ if (backOffset >= 0) {
+ lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
}
}
@@ -476,4 +616,162 @@ public class GenerateLineBreakTest implements UCD_Types {
rule="20";
return true;
}
+
+ static class GenerateWordBreakTest extends GenerateLineBreakTest {
+
+ static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8,
+ oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES!
+ L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT,
+ LIMIT = LVT + 1;
+
+ static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };
+
+ static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
+ static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
+ static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
+
+ {
+ fileName = "Word";
+ TypeOrder = new byte[LIMIT];
+ for (byte i = 0; i < TypeOrder.length; ++i) {
+ TypeOrder[i] = i;
+ }
+ }
+
+ boolean skipType(byte type) {
+ return false;
+ }
+
+ public int getLimit() {
+ return LIMIT;
+ }
+
+ public int getTableLimit() {
+ return LIMIT;
+ }
+
+ // stuff that subclasses need to override
+ public int genTestItems(String before, String after, String[] results) {
+ results[0] = before + after;
+ return 1;
+ }
+
+ // Table cell for the pair (before, after): calls isBreak on the concatenation
+ // at the boundary offset before.length() and maps the answer to BREAK or NOBREAK.
+ // ruleOut[0] receives the value of the rule field as it stands after that call.
+ public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
+ boolean normalBreak = isBreak(before + after, before.length(), recommended);
+ ruleOut[0] = rule;
+ return normalBreak ? BREAK : NOBREAK;
+ }
+
+ // stuff that subclasses need to override
+ public String getTypeID(int cp) {
+ byte type = getType(cp);
+ if (type >= oLIMIT) return hNames[type - oLIMIT];
+ return Names[type];
+ }
+
+ // stuff that subclasses need to override
+ public byte getType(int cp) {
+ // single characters
+ if (cp == 0xA) return LF;
+ if (cp == 0xD) return CR;
+ if (cp == 0x034F) return CGJ;
+ if (cp == 0x2028 || cp == 0x2029) return Control;
+
+ // Hangul
+ byte result = getHangulType(cp);
+ if (result != hNot) return (byte)(result + oLIMIT);
+
+ // other properties
+ // category based
+ byte cat = Default.ucd.getCategory(cp);
+ if (cat == Cc) return Control;
+ if (cat == Cf) return Extend;
+ if (((1< source.length()) return false;
+ if (offset == 0) return true;
+
+ rule = "2";
+ if (offset == source.length()) return true;
+
+ // UTF-16: never break in the middle of a code point
+ if (!onCodepointBoundary(source, offset)) return false;
+
+ // now get the character before and after, and their types
+
+
+ int cpBefore = UTF16.charAt(source, offset-1);
+ int cpAfter = UTF16.charAt(source, offset);
+
+ byte before = getResolvedType(cpBefore, recommended);
+ byte after = getResolvedType(cpAfter, recommended);
+
+ rule = "3";
+ if (before == CR && after == LF) return false;
+
+ rule = "4";
+ if (before == CR || before == LF || before == Control
+ || after == Control || after == LF || after == CR) return true;
+
+ rule = "6";
+ if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
+
+ rule = "7";
+ if ((before == LV || before == V) && (after == V || after == T)) return false;
+
+ rule = "8";
+ if ((before == LVT || before == T) && (after == T)) return false;
+
+ rule = "9";
+ if (after == Extend) return false;
+
+ if (recommended) {
+ if (after == Link || after == CGJ) return false;
+ } else {
+
+ // Do not break around a CGJ.
+ rule = "10";
+ if (before == CGJ && (after == Base
+ || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false;
+ rule = "11";
+ if (after == CGJ) return false;
+
+ // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
+
+ rule = "12";
+ //Link Extend* × LetterBase (12)
+ if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
+ int backOffset = findLastNon(source, offset, Extend, recommended);
+ if (backOffset >= 0) {
+ byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended);
+ if (last == Link) return false;
+ }
+ }
+
+ rule = "13";
+ if (after == Link) return false;
+ }
+
+ // Otherwise break after all characters.
+ rule = "14";
+ return true;
+
+ }
+
+ }
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
index 250266f9923..56f2e0fdc4d 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.2 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -23,20 +23,23 @@ public class GenerateThaiBreaks {
BufferedReader br = new BufferedReader(
new InputStreamReader(
- new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
+ new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
PrintWriter out = null;
try {
Default.setUCD();
- UnicodeSet ignorables = new UnicodeSet(0xE30, 0xE3A);
+ UnicodeSet ignorables = new UnicodeSet();
+ /* new UnicodeSet(0xE30, 0xE3A);
ignorables.add(0x0E40, 0x0E44); // add logical order exception
ignorables.add(0x0E47, 0x0E4E);
+ */
ignorables.add(0, ' '); // add controls
ignorables.add('.');
- Set initials = new TreeSet();
- Set finals = new TreeSet();
- Set medials = new TreeSet();
+
+ UnicodeSet initials = new UnicodeSet();
+ UnicodeSet finals = new UnicodeSet();
+ UnicodeSet medials = new UnicodeSet();
char[] buffer = new char[100];
@@ -60,34 +63,58 @@ public class GenerateThaiBreaks {
}
initials.add(temp.substring(0,1));
- initials.add(temp.substring(0,2));
- finals.add(temp.substring(temp.length()-2));
+ //initials.add(temp.substring(0,2));
finals.add(temp.substring(temp.length()-1));
+ //finals.add(temp.substring(temp.length()-1));
- for (int i = 1; i < temp.length() - 3; ++i) {
- medials.add(temp.substring(i, i+2));
+ for (int i = 1; i < temp.length() - 1; ++i) {
+ //medials.add(temp.substring(i, i+2));
medials.add(temp.substring(i, i+1));
}
- medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
+ //medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
}
System.out.println("initials size: " + initials.size());
System.out.println("finals size: " + finals.size());
System.out.println("medials size: " + medials.size());
+ //out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
+ // out.write('\uFEFF');
+
+ UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
+ finals.addAll(marks);
+
+ UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
+
+ UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
+
+ System.out.println("Never occur: " + missingThai.toPattern(true));
+ Utility.showSetNames("", missingThai, true, Default.ucd);
+ System.out.println();
+
+ UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
+ UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
+
+ System.out.println("Never initial: " + neverInitial.toPattern(true));
+ Utility.showSetNames("", neverInitial, true, Default.ucd);
+ System.out.println();
+
+ System.out.println("Never final: " + neverFinal.toPattern(true));
+ Utility.showSetNames("", neverFinal, true, Default.ucd);
+ System.out.println();
+
initials.removeAll(medials);
finals.removeAll(medials);
System.out.println("initials size: " + initials.size());
System.out.println("finals size: " + finals.size());
- out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
- out.write('\uFEFF');
- out.println("Only Initials");
- Utility.print(out, initials, ", ", new MyBreaker());
- out.println();
- out.println("Only Finals");
- Utility.print(out, finals, ", ", new MyBreaker());
+ // initials with every medial removed; ": " separates the label from the pattern, as in the "Never ...: " lines.
+ System.out.println("Only Initials: " + initials.toPattern(true));
+ Utility.showSetNames("", initials, true, Default.ucd);
+ System.out.println();
+
+ // finals with every medial removed.
+ System.out.println("Only Finals: " + finals.toPattern(true));
+ Utility.showSetNames("", finals, true, Default.ucd);
} finally {
br.close();
if (out != null) out.close();
diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java
index 40dfef6f186..68b630d8457 100644
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.19 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.20 $
*
*******************************************************************************
*/
@@ -78,7 +78,7 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
- else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
+ else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java
index 2e6adf2cdbd..324ea0a7f43 100644
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2002/07/30 09:56:40 $
-* $Revision: 1.16 $
+* $Date: 2002/08/04 21:38:45 $
+* $Revision: 1.17 $
*
*******************************************************************************
*/
@@ -737,6 +737,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.NT[prop];
}
+ public static String getNumericTypeID_fromIndex(byte prop, byte style) {
+ return style == SHORT ? UCD_Names.SHORT_NT[prop] : UCD_Names.NT[prop];
+ }
+
public String getEastAsianWidthID(int codePoint) {
return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint));
}
@@ -745,6 +749,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.EA[prop];
}
+ public static String getEastAsianWidthID_fromIndex(byte prop, byte style) {
+ return style != LONG ? UCD_Names.SHORT_EA[prop] : UCD_Names.EA[prop];
+ }
+
public String getLineBreakID(int codePoint) {
return getLineBreakID_fromIndex(getLineBreak(codePoint));
}
@@ -753,6 +761,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.LB[prop];
}
+ public static String getLineBreakID_fromIndex(byte prop, byte style) {
+ return style != LONG ? UCD_Names.LB[prop] : UCD_Names.LONG_LB[prop];
+ }
+
public String getJoiningTypeID(int codePoint) {
return getJoiningTypeID_fromIndex(getJoiningType(codePoint));
}
@@ -761,6 +773,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.JOINING_TYPE[prop];
}
+ public static String getJoiningTypeID_fromIndex(byte prop, byte style) {
+ return style != LONG ? UCD_Names.JOINING_TYPE[prop] : UCD_Names.LONG_JOINING_TYPE[prop];
+ }
+
public String getJoiningGroupID(int codePoint) {
return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint));
}
@@ -769,6 +785,11 @@ public final class UCD implements UCD_Types {
return UCD_Names.JOINING_GROUP[prop];
}
+ public static String getJoiningGroupID_fromIndex(byte prop, byte style) {
+ // no short version
+ return UCD_Names.JOINING_GROUP[prop];
+ }
+
public String getScriptID(int codePoint) {
return getScriptID_fromIndex(getScript(codePoint));
}
@@ -790,6 +811,11 @@ public final class UCD implements UCD_Types {
return UCD_Names.AGE[prop];
}
+ public static String getAgeID_fromIndex(byte prop, byte style) {
+ // no short for
+ return UCD_Names.AGE[prop];
+ }
+
public String getBinaryPropertiesID(int codePoint, byte bit) {
return (getBinaryProperties(codePoint) & (1<>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style);
case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style);
case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style);
- case NUMERIC_TYPE>>8: if (propValue >= LIMIT_NUMERIC_TYPE) break;
- if (style != SHORT) return ucd.getNumericTypeID_fromIndex((byte)propValue);
- return UCD_Names.SHORT_NT[propValue];
- case EAST_ASIAN_WIDTH>>8: if (propValue >= LIMIT_EAST_ASIAN_WIDTH) break;
- if (style != LONG) return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
- return UCD_Names.SHORT_EA[propValue];
- case LINE_BREAK>>8: if (propValue >= LIMIT_LINE_BREAK) break;
- if (style != LONG) return ucd.getLineBreakID_fromIndex((byte)propValue);
- return UCD_Names.LONG_LB[propValue];
- case JOINING_TYPE>>8: if (propValue >= LIMIT_JOINING_TYPE) break;
- if (style != LONG) return ucd.getJoiningTypeID_fromIndex((byte)propValue);
- return UCD_Names.LONG_JOINING_TYPE[propValue];
- case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
- return ucd.getJoiningGroupID_fromIndex((byte)propValue);
+ case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex((byte)propValue, style); // return is required: a bare call discards the name and falls through to EAST_ASIAN_WIDTH
+ case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
+ case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style);
+ case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue, style); // pass style so LONG still selects LONG_JOINING_TYPE, as the replaced code did
+ case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue);
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
- case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
- if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
- return UCD_Names.ABB_SCRIPT[propValue];
- case AGE>>8: if (propValue >= LIMIT_AGE) break;
- return ucd.getAgeID_fromIndex((byte)propValue);
+ case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue);
+ case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
/*
case DERIVED>>8:
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java
index 58c89c09797..67fc9fcb655 100644
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/07/30 09:56:41 $
-* $Revision: 1.23 $
+* $Date: 2002/08/04 21:38:44 $
+* $Revision: 1.24 $
*
*******************************************************************************
*/
@@ -17,9 +17,10 @@ import java.util.*;
import java.text.*;
import java.io.*;
import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UTF16;
import com.ibm.text.UCD.*;
-public final class Utility { // COMMON UTILITIES
+public final class Utility implements UCD_Types { // COMMON UTILITIES
static final boolean UTF8 = true; // TODO -- make argument
@@ -470,7 +471,22 @@ public final class Utility { // COMMON UTILITIES
return quoteXML(source, false);
}
-
+ private static UnicodeProperty defaultIgnorable = null;
+
+ public static String getDisplay(int cp) {
+ String result = UTF16.valueOf(cp);
+ byte cat = Default.ucd.getCategory(cp);
+ if (cat == Mn || cat == Me) {
+ result = String.valueOf(DOTTED_CIRCLE) + result;
+ } else if (cat == Cf || cat == Cc || cp == 0x034F || cp == 0x00AD || cp == 0x1806) {
+ result = "\u25A1";
+ } else {
+ if (defaultIgnorable == null) defaultIgnorable = DerivedProperty.make(DefaultIgnorable);
+ if (defaultIgnorable.hasValue(cp)) result = "\u25A1";
+ }
+ return result;
+ }
+
public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
while (aStart < aEnd && bStart < bEnd) {
int diff = a[aStart++] - b[bStart++];
---|