From c4392b5dab953e61840b3be27cf786947a720ca2 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 1 Apr 2003 02:53:07 +0000 Subject: [PATCH] Fixing Break Charts & Tests X-SVN-Rev: 11428 --- .../com/ibm/text/UCA/WriteCollationData.java | 21 +- .../com/ibm/text/UCD/DerivedProperty.java | 6 +- .../com/ibm/text/UCD/GenerateBreakTest.java | 796 ++++++++---------- .../com/ibm/text/UCD/UCD_Types.java | 10 +- .../com/ibm/text/UCD/UnicodeMap.java | 94 +++ .../com/ibm/text/utility/Utility.java | 13 +- 6 files changed, 487 insertions(+), 453 deletions(-) create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnicodeMap.java diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 3769d09362c..eb9a820a4ac 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2003/03/19 23:31:12 $ -* $Revision: 1.30 $ +* $Date: 2003/04/01 02:51:57 $ +* $Revision: 1.31 $ * ******************************************************************************* */ @@ -2492,17 +2492,26 @@ F900..FAFF; CJK Compatibility Ideographs static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster + static UnicodeSet needsQuoting = null; + static final String quoteOperand(String s) { + if (needsQuoting == null) { + /* + c >= 'a' && c <= 'z' + || c >= 'A' && c <= 'Z' + || c >= '0' && c <= '9' + || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c)) + */ + needsQuoting = new UnicodeSet("[a-zA-Z0-9\\u00A0-\\U00010FFF]"); + needsQuoting.remove(); + } s = NFC.normalize(s); quoteOperandBuffer.setLength(0); boolean noQuotes = true; boolean inQuote = false; for (int i = 0; i < s.length(); ++i) { char c = s.charAt(i); - if (c >= 'a' && c <= 'z' - || c >= 'A' && c <= 'Z' - || c >= '0' && c <= '9' - || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))) { + if (!needsQuoting.contains(c)) { if (inQuote) { quoteOperandBuffer.append('\''); inQuote = false; diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 06033796ae8..f9265320df5 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.19 $ +* $Date: 2003/04/01 02:52:00 $ +* $Revision: 1.20 $ * ******************************************************************************* */ @@ -501,7 +501,7 @@ public final class DerivedProperty implements UCD_Types { public boolean hasValue(int cp) { byte cat = ucdData.getCategory(cp); if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl - || ucdData.getBinaryProperty(cp, Alphabetic)) return true; + || ucdData.getBinaryProperty(cp, Other_Alphabetic)) return true; return false; } }; diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java index 5d2b3ff435a..bd7159fa5e4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ -* $Date: 2003/02/25 23:38:23 $ -* $Revision: 1.3 $ +* $Date: 2003/04/01 02:52:00 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -97,6 +97,8 @@ abstract public class GenerateBreakTest implements UCD_Types { // ====================== Main =========================== static final boolean SHOW_TYPE = false; + + UnicodeMap sampleMap = null; public static void main(String[] args) throws IOException { @@ -116,9 +118,9 @@ abstract public class GenerateBreakTest implements UCD_Types { PrintWriter systemPrintWriter = new PrintWriter(System.out); gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false); systemPrintWriter.flush(); - showSet("sepSet", GenerateSentenceBreakTest.sepSet); - showSet("atermSet", GenerateSentenceBreakTest.atermSet); - showSet("termSet", GenerateSentenceBreakTest.termSet); + //showSet("sepSet", GenerateSentenceBreakTest.sepSet); + //showSet("atermSet", GenerateSentenceBreakTest.atermSet); + //showSet("termSet", GenerateSentenceBreakTest.termSet); } if (true) { @@ -127,13 +129,13 @@ abstract public class GenerateBreakTest implements UCD_Types { foo.isBreak("3.4", 2, true); } + new GenerateGraphemeBreakTest().run(); + new GenerateWordBreakTest().run(); + new GenerateLineBreakTest().run(); new GenerateSentenceBreakTest().run(); //if (true) return; // cut short for now - new GenerateLineBreakTest().run(); - new GenerateGraphemeBreakTest().run(); - new GenerateWordBreakTest().run(); } static void checkDecomps() { @@ -146,8 +148,8 @@ abstract public class GenerateBreakTest implements UCD_Types { tests[0].isBreak("\u0300\u0903", 1, true); System.out.println("Check Decomps"); - System.out.println("otherExtendSet: " + ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet.toPattern(true)); - Utility.showSetNames("", ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet, false, Default.ucd); + //System.out.println("otherExtendSet: " + ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet.toPattern(true)); + //Utility.showSetNames("", ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet, false, Default.ucd); for (int k = 0; k < tests.length; ++k) { for (int i = 0; i < 0x10FFFF; ++i) { @@ -223,6 +225,8 @@ abstract public class GenerateBreakTest implements UCD_Types { } return result; } + + static void generateTerminalClosure() { UnicodeSet terminals = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Terminal_Punctuation).getSet(); @@ -274,10 +278,16 @@ abstract public class GenerateBreakTest implements UCD_Types { //printLine(out, samples[LB_ZW], "", samples[LB_CL]); //printLine(out, samples[LB_ZW], " ", samples[LB_CL]); - PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS); - out.println("" - + fileName + " Break Chart"); - out.println("

Current:

"); + PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest.html", Utility.UTF8_WINDOWS); + out.println(""); + out.println("" + fileName + " Break Chart"); + out.println(""); + + + out.println("

Sample Break Table

"); + out.println("

Version: " + Default.ucd.getVersion() + "

"); if (recommendedDiffers()) { @@ -288,20 +298,33 @@ abstract public class GenerateBreakTest implements UCD_Types { } else { generateTable(out, true); } + + + if (sampleMap != null) { + out.println("

Character Type Breakdown

"); + out.println(""); + for (int i = 0; i < sampleMap.size(); ++i) { + out.println(""); + } + out.println("
" + sampleMap.getLabelFromIndex(i) + + "" + sampleMap.getSetFromIndex(i) + + "
"); + } + out.close(); if (recommendedDiffers()) { generateTest(false, false); } generateTest(false, true); - + } public void generateTest(boolean shortVersion, boolean recommended) throws IOException { String[] testCase = new String[50]; // do main test - PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest" + PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest" + (recommended & recommendedDiffers() ? "_NEW" : "") + (shortVersion ? "_SHORT" : "") + ".txt", Utility.UTF8_WINDOWS); @@ -436,7 +459,7 @@ abstract public class GenerateBreakTest implements UCD_Types { } public void generateTable(PrintWriter out, boolean recommended) { - String width = "width='" + (100 / (tableLimit + 2)) + "%'"; + String width = "width='" + (100 / (tableLimit + 1)) + "%'"; out.print(""); String types = ""; String codes = ""; @@ -445,12 +468,12 @@ abstract public class GenerateBreakTest implements UCD_Types { if (after == null) continue; String h = getTypeID(after, recommended); - types += ""; - codes += ""; + types += ""; + //codes += ""; } - out.println("" + types + ""); - out.println("" + codes + ""); + out.println("" + types + ""); + // out.println("" + codes + ""); String[] rule = new String[1]; String[] rule2 = new String[1]; @@ -458,8 +481,8 @@ abstract public class GenerateBreakTest implements UCD_Types { String before = samples[type]; if (before == null) continue; - String line = "" - + ""; + String line = ""; for (int type2 = 0; type2 < tableLimit; ++type2) { String after = samples[type2]; @@ -482,6 +505,8 @@ abstract public class GenerateBreakTest implements UCD_Types { out.println(line + ""); } out.println("
" + h + "" + Utility.hex(after) + "" + h + "" + Utility.hex(after) + "
" + getTypeID(before, recommended) + "" + Utility.hex(before) + "
" + + getTypeID(before, recommended) + "
"); + out.println("

Sample Strings

"); + out.println("
    "); for (int ii = 0; ii < extraSingleSamples.length; ++ii) { out.println("
  1. "); @@ -622,6 +647,9 @@ abstract public class GenerateBreakTest implements UCD_Types { return -1; } + public static UnicodeSet getSet(int prop, byte propValue) { + return UnifiedBinaryProperty.make(prop | propValue).getSet(); + } static public class Context { public int cpBefore2, cpBefore, cpAfter, cpAfter2; @@ -635,7 +663,7 @@ abstract public class GenerateBreakTest implements UCD_Types { } } - public void getGraphemeBases(String source, int offset, boolean recommended, byte ignoreType, Context context) { + public void getGraphemeBases(String source, int offset, boolean recommended, int ignoreType, Context context) { context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1; context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1; //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType); @@ -677,6 +705,244 @@ abstract public class GenerateBreakTest implements UCD_Types { } + //============================================== + + static class GenerateGraphemeBreakTest extends GenerateBreakTest { + + + static final UnicodeMap map = new UnicodeMap(); + static final int + CR = map.add("CR", new UnicodeSet(0xA, 0xA)), + LF = map.add("LF", new UnicodeSet(0xD, 0xD)), + Control = map.add("Control", + getSet(CATEGORY, Cc) + .addAll(getSet(CATEGORY, Cf)) + .addAll(getSet(CATEGORY, Zp)) + .addAll(getSet(CATEGORY, Zl)) + .removeAll(map.getSetFromIndex(CR)) + .removeAll(map.getSetFromIndex(LF))), + Extend = map.add("Extend", getSet(DERIVED, GraphemeExtend)), + L = map.add("L", getSet(HANGUL_SYLLABLE_TYPE, UCD_Types.L)), + V = map.add("V", getSet(HANGUL_SYLLABLE_TYPE, UCD_Types.V)), + T = map.add("T", getSet(HANGUL_SYLLABLE_TYPE, UCD_Types.T)), + LV = map.add("LV", getSet(HANGUL_SYLLABLE_TYPE, UCD_Types.LV)), + LVT = map.add("LVT", getSet(HANGUL_SYLLABLE_TYPE, UCD_Types.LVT)), + Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false); + + { + fileName = "GraphemeCluster"; + sampleMap = map; + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + return map.getLabel(cp); + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + return (byte) map.getIndex(cp); + } + + public boolean isBreak(String source, int offset, boolean recommended) { + recommended = true; // don't care about old stuff + rule="1: sot ÷"; + if (offset < 0 || offset > source.length()) return false; + if (offset == 0) return true; + + rule = "2: ÷ eot"; + if (offset == source.length()) return true; + + // UTF-16: never break in the middle of a code point + if (!onCodepointBoundary(source, offset)) return false; + + // now get the character before and after, and their types + + + int cpBefore = UTF16.charAt(source, offset-1); + int cpAfter = UTF16.charAt(source, offset); + + byte before = getResolvedType(cpBefore, recommended); + byte after = getResolvedType(cpAfter, recommended); + + rule = "3: CR × LF"; + if (before == CR && after == LF) return false; + + rule = "4: ( Control | CR | LF ) ÷"; + if (before == CR || before == LF || before == Control) return true; + + rule = "5: ÷ ( Control | CR | LF )"; + if (after == Control || after == LF || after == CR) return true; + + rule = "6: L × ( L | V | LV | LVT )"; + if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; + + rule = "7: ( LV | V ) × ( V | T )"; + if ((before == LV || before == V) && (after == V || after == T)) return false; + + rule = "8: ( LVT | T ) × T"; + if ((before == LVT || before == T) && (after == T)) return false; + + rule = "9: × Extend"; + if (after == Extend) return false; + + // Otherwise break after all characters. + rule = "10: Any ÷ Any"; + return true; + + } + + } + + //============================================== + + static class GenerateWordBreakTest extends GenerateBreakTest { + + //static String LENGTH = "[\u30FC\uFF70]"; + //static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]"; + //static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; + //static String HIRAGANA_ITERATION = "[\u309D\u309E]"; + + static final UnicodeMap map = new UnicodeMap(); + static final int + Format = map.add("Format", getSet(CATEGORY, Cf).remove(0x00AD)), + Katakana = map.add("Katakana", getSet(SCRIPT, KATAKANA_SCRIPT) + .addAll(new UnicodeSet("[\u30FC\uFF70\uFF9E\uFF9F]")) + //.addAll(new UnicodeSet(HALFWIDTH_KATAKANA)) + //.addAll(new UnicodeSet(KATAKANA_ITERATION)) + ), + ALetter = map.add("ALetter", + getSet(DERIVED, PropAlphabetic) + .add(0x05F3, 0x05F3) + .removeAll(map.getSetFromIndex(Katakana)) + .removeAll(getSet(BINARY_PROPERTIES, Ideographic)) + .removeAll(getSet(SCRIPT, THAI_SCRIPT)) + .removeAll(getSet(SCRIPT, LAO_SCRIPT)) + .removeAll(getSet(SCRIPT, HIRAGANA_SCRIPT)) + ), + MidLetter = map.add("MidLetter", + new UnicodeSet("[\\u0027\\u00AD\\u00B7\\u05f4\\u05F4\\u2019\\u2027]")), + MidNumLet = map.add("MidNumLet", + new UnicodeSet("[\\u002E\\u003A]")), + MidNum = map.add("MidNum", getSet(LINE_BREAK, LB_IN) + .removeAll(map.getSetFromIndex(MidNumLet))), + Numeric = map.add("Numeric", getSet(LINE_BREAK, LB_NU)), + Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false); + + + + static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest(); + static Context context = new Context(); + + { + fileName = "Word"; + sampleMap = map; + extraSamples = new String[] { + "\uFF70", "\uFF65", "\u30FD", "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060" + }; + + String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; + extraSingleSamples = new String [temp.length * 2]; + System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); + for (int i = 0; i < temp.length; ++i) { + extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); + } + + if (false) Utility.showSetDifferences("Katakana", map.getSetFromIndex(Katakana), + "Script=Katakana", getSet(SCRIPT, KATAKANA_SCRIPT), false, Default.ucd); + + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + return map.getLabel(cp); + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + return (byte) map.getIndex(cp); + } + + public int genTestItems(String before, String after, String[] results) { + results[0] = before + after; + results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; + results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; + results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; + return 3; + } + + public boolean isBreak(String source, int offset, boolean recommended) { + recommended = true; // don't care about old stuff + + rule = "1: sot ÷"; + if (offset < 0 || offset > source.length()) return false; + + if (offset == 0) return true; + + rule = "2: ÷ eot"; + if (offset == source.length()) return true; + + // Treat a grapheme cluster as if it were a single character: + // the first base character, if there is one; otherwise the first character. + // GC => FB + + rule="3: GC -> FB; 4: X Format* -> X"; + if (!grapheme.isBreak( source, offset, recommended)) return false; + + // now get the base character before and after, and their types + + getGraphemeBases(source, offset, recommended, Format, context); + + byte before = context.tBefore; + byte after = context.tAfter; + byte before2 = context.tBefore2; + byte after2 = context.tAfter2; + + //Don't break between most letters + + rule = "5: ALetter × ALetter"; + if (before == ALetter && after == ALetter) return false; + + // Don’t break letters across certain punctuation + + rule = "6: ALetter × (MidLetter | MidNumLet) ALetter"; + if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false; + + rule = "7: ALetter (MidLetter | MidNumLet) × ALetter"; + if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false; + + // Don’t break within sequences of digits, or digits adjacent to letters. + + rule = "8: Numeric × Numeric"; + if (before == Numeric && after == Numeric) return false; + + rule = "9: ALetter × Numeric"; + if (before == ALetter && after == Numeric) return false; + + rule = "10: Numeric × ALetter"; + if (before == Numeric && after == ALetter) return false; + + + // Don’t break within sequences like: '-3.2' + rule = "11: Numeric (MidNum | MidNumLet) × Numeric"; + if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false; + + rule = "12: Numeric × (MidNum | MidNumLet) Numeric"; + if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false; + + // Don't break between Katakana + + rule = "13: Katakana × Katakana"; + if (before == Katakana && after == Katakana) return false; + + // Otherwise break always. + rule = "14: Any ÷ Any"; + return true; + + } + + } + // ======================================== static class GenerateLineBreakTest extends GenerateBreakTest { @@ -1070,399 +1336,56 @@ abstract public class GenerateBreakTest implements UCD_Types { //============================================== - static class GenerateGraphemeBreakTest extends GenerateBreakTest { - - static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8, - oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES! - L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT, - LIMIT = LVT + 1; - - static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" }; - - static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend); - static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); - static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); - static UnicodeSet otherExtendSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Other_GraphemeExtend).getSet() - .addAll(new UnicodeSet("[\u09BE\u09D7\u0B3E\u0B57\u0BD7\u0BBE" - + "\u0CC2\u0CD5\u0CD6\u0D3E\u0D57\u0DCF\u0DDF\\U0001D165\\U0001D16E\\U0001D16F]")); - - { - fileName = "GraphemeCluster"; - } - - // stuff that subclasses need to override - public String getTypeID(int cp, boolean recommended) { - byte type = getType(cp, recommended); - if (type >= oLIMIT) return hNames[type - oLIMIT]; - return Names[type]; - } - - // stuff that subclasses need to override - public byte getType(int cp, boolean recommended) { - recommended = true; // don't care about old stuff - // single characters - if (cp == 0xA) return LF; - if (cp == 0xD) return CR; - if (recommended) { - if (cp == 0x034F) return Extend; - } - if (cp == 0x2028 || cp == 0x2029) return Control; - - // Hangul - byte result = getHangulType(cp); - if (result != hNot) return (byte)(result + oLIMIT); - - // other properties - // category based - byte cat = Default.ucd.getCategory(cp); - if (cat == Cc) return Control; - if (recommended) { - if (cat == Cf) return Control; - if (cat == Me || cat == Mn) return Extend; - // FOR FUTURE! if (otherExtendSet.contains(cp)) return Extend; - return Base; - } - if (cat == Cf) return Extend; - - if (((1< source.length()) return false; - if (offset == 0) return true; - - rule = "2"; - if (offset == source.length()) return true; - - // UTF-16: never break in the middle of a code point - if (!onCodepointBoundary(source, offset)) return false; - - // now get the character before and after, and their types - - - int cpBefore = UTF16.charAt(source, offset-1); - int cpAfter = UTF16.charAt(source, offset); - - byte before = getResolvedType(cpBefore, recommended); - byte after = getResolvedType(cpAfter, recommended); - - rule = "3"; - if (before == CR && after == LF) return false; - - rule = "4"; - if (before == CR || before == LF || before == Control - || after == Control || after == LF || after == CR) return true; - - rule = "6"; - if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; - - rule = "7"; - if ((before == LV || before == V) && (after == V || after == T)) return false; - - rule = "8"; - if ((before == LVT || before == T) && (after == T)) return false; - - rule = "9"; - if (after == Extend) return false; - - if (recommended) { - if (after == Link || after == CGJ) return false; - } else { - - // Do not break around a CGJ. - rule = "10"; - if (before == CGJ && (after == Base - || after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false; - rule = "11"; - if (after == CGJ) return false; - - // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together. - - rule = "12"; - //Link Extend* × LetterBase (12) - if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) { - int backOffset = findLastNon(source, offset, Extend, recommended); - if (backOffset >= 0) { - byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended); - if (last == Link) return false; - } - } - - rule = "13"; - if (after == Link) return false; - } - - // Otherwise break after all characters. - rule = "14"; - return true; - - } - - } - - //============================================== - - static class GenerateWordBreakTest extends GenerateBreakTest { - - static final byte Format = 0, Katakana = 1, ALetter = 2, MidLetter = 3, Hyphen = 4, - Numeric = 5, Infix_Numeric = 6, Prefix_Numeric = 7, Postfix_Numeric = 8, - Prefix = 9, Postfix = 10, MidNumLet = 11, Hiragana = 12, Other = 13, - LIMIT = Other + 1; - - static final String[] Names = {"Format", "Katakana", "ALetter", "MidLetter", "Hyphen", - "Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "MidNumLet", "Hiragana", "Other" }; - - static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest(); - static Context context = new Context(); - - static String LENGTH = "[\u30FC\uFF70]"; - static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9F]"; - static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; - static String HIRAGANA_ITERATION = "[\u309D\u309E]"; - - static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]"); - - static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase); - static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink); - - static UnicodeSet prefixSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Logical_Order_Exception).getSet(); - static UnicodeSet postfixSet; - static UnicodeSet exceptionLetters; - static UnicodeSet normalLetters; - static UnicodeSet thaiLaoNormal; - - static UnicodeSet marks = UnifiedBinaryProperty.make(CATEGORY | Mn).getSet() - .addAll(UnifiedBinaryProperty.make(CATEGORY | Me).getSet()); - - static UnicodeSet oughtToBeLm = new UnicodeSet("[\u02B9-\u02BA\u02C2-\u02CF\u02D2-\u02DF\u02E5-\u02ED]"); -/* -U+02B9..U+02BA # MODIFIER LETTER PRIME..MODIFIER LETTER DOUBLE PRIME -U+02C2..U+02CF # MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER LOW ACUTE ACCENT -U+02D2..U+02DF # MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCE -U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRATED -*/ - - static UnicodeSet letterSet = UnifiedBinaryProperty.make(CATEGORY | Lo).getSet() - .addAll(UnifiedBinaryProperty.make(CATEGORY | Lu).getSet()) - .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet()) - .addAll(UnifiedBinaryProperty.make(CATEGORY | Ll).getSet()) - .addAll(UnifiedBinaryProperty.make(CATEGORY | Lm).getSet()) - .addAll(UnifiedBinaryProperty.make(CATEGORY | Mc).getSet()) - .addAll(oughtToBeLm); - - { - - UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NS).getSet(); - - postfixSet = new UnicodeSet(linebreakNS) - .retainAll(letterSet) - .addAll(new UnicodeSet("[\u0e30-\u0E3A\u0e45-\u0e4e]")) - .addAll(new UnicodeSet("[[\u0eb0-\u0EBd\u0ec6-\u0ece]-[:Cn:]]")) - .removeAll(new UnicodeSet("[:mn:]")) - .removeAll(new UnicodeSet("[:katakana:]").addAll(extraKatakana)); - - thaiLaoNormal = new UnicodeSet("[[:thai:][:lao:]]").removeAll(prefixSet).removeAll(postfixSet); - - // we want ideographics, hiragana, thai (except prefix/suffix) - //UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]"); - - UnicodeSet hiragana = UnifiedBinaryProperty.make(SCRIPT | HIRAGANA_SCRIPT).getSet(); - UnicodeSet smallHiragana = new UnicodeSet(hiragana).retainAll(linebreakNS); - - - UnicodeSet missingKatakana = new UnicodeSet(extraKatakana).removeAll(new UnicodeSet("[:katakana:]")); - - if (DEBUG) { - System.out.println("missingKatakana: " + missingKatakana.toPattern(true)); - Utility.showSetNames("", missingKatakana, false, Default.ucd); - - - System.out.println("oughtToBeLm: " + oughtToBeLm.toPattern(true)); - Utility.showSetNames("", oughtToBeLm, false, Default.ucd); - - - System.out.println("Prefix: " + prefixSet.toPattern(true)); - Utility.showSetNames("", prefixSet, false, Default.ucd); - - System.out.println("Postfix: " + postfixSet.toPattern(true)); - Utility.showSetNames("", postfixSet, false, Default.ucd); - - System.out.println("exceptionLetters: " + exceptionLetters.toPattern(true)); - - System.out.println("hiragana: " + hiragana.toPattern(true)); - - System.out.println("smallHiragana: " + hiragana.toPattern(true)); - Utility.showSetNames("", smallHiragana, true, Default.ucd); - - System.out.println("midLetterSet: " + midLetterSet.toPattern(true)); - Utility.showSetNames("", midLetterSet, true, Default.ucd); - } - - - fileName = "Word"; - extraSamples = new String[] { - "\uFF70", "\uFF65", "\u30FD", "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060" - }; - if (DEBUG) { - System.out.println("length not covered: " - + new UnicodeSet(LENGTH).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); - System.out.println("half-width not covered: " - + new UnicodeSet(HALFWIDTH_KATAKANA).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); - System.out.println("k.iteration not covered: " - + new UnicodeSet(KATAKANA_ITERATION).removeAll(new UnicodeSet("[:katakana:]")).toPattern(true)); - System.out.println("h.iteration not covered: " - + new UnicodeSet(HIRAGANA_ITERATION).removeAll(new UnicodeSet("[:hiragana:]")).toPattern(true)); - System.out.println("L1: " + getTypeID('\u30FC')); - System.out.println("L2: " + getTypeID('\uFF70')); - } - - String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; - extraSingleSamples = new String [temp.length * 2]; - System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); - for (int i = 0; i < temp.length; ++i) { - extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); - } - - } - - // stuff that subclasses need to override - public String getTypeID(int cp, boolean recommended) { - byte type = getType(cp, recommended); - return Names[type]; - } - - // stuff that subclasses need to override - public byte getType(int cp, boolean recommended) { - byte cat = Default.ucd.getCategory(cp); - - if (cat == Cf) return Format; - - byte script = Default.ucd.getScript(cp); - - if (script == KATAKANA_SCRIPT) return Katakana; - if (extraKatakana.contains(cp)) return Katakana; - - if (script == HIRAGANA_SCRIPT || script == THAI_SCRIPT || script == LAO_SCRIPT) return Other; - if (ideographicSet.contains(cp)) return Other; - - if (alphabeticSet.contains(cp)) return ALetter; - - byte lb = Default.ucd.getLineBreak(cp); - - if (lb == LB_NU) return Numeric; - - if (midLetterSet.contains(cp)) { - if (lb == LB_IS) return MidNumLet; - return MidLetter; - } - if (lb == LB_IS) return Infix_Numeric; - - return Other; - } - - public int genTestItems(String before, String after, String[] results) { - results[0] = before + after; - results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; - results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; - results[3] = 'a' + before + "\u0301\u0308" + samples[Infix_Numeric] + after + "\u0301\u0308" + 'a'; - return 3; - } - - public boolean isBreak(String source, int offset, boolean recommended) { - recommended = true; // don't care about old stuff - - rule = "1"; - if (offset < 0 || offset > source.length()) return false; - - if (offset == 0) return true; - - rule = "2"; - if (offset == source.length()) return true; - - // Treat a grapheme cluster as if it were a single character: - // the first base character, if there is one; otherwise the first character. - // GC => FB - - rule="3"; - if (!grapheme.isBreak( source, offset, recommended)) return false; - - // now get the base character before and after, and their types - - getGraphemeBases(source, offset, recommended, Format, context); - - byte before = context.tBefore; - byte after = context.tAfter; - byte before2 = context.tBefore2; - byte after2 = context.tAfter2; - - //Don't break between most letters - // ALetter × ALetter - - rule = "5"; - if (before == ALetter && after == ALetter) return false; - - // Don’t break letters across certain punctuation - // ALetter×(MidLetter | MidNumLet) ALetter(6) - // ALetter (MidLetter | MidNumLet)×ALetter(7) - - rule = "6"; - if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false; - - rule = "7"; - if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false; - - // Don’t break within sequences of digits, or digits adjacent to letters. - - // Numeric × Numeric (5) - rule = "8"; - if (before == Numeric && after == Numeric) return false; - - // ALetter × Numeric (6) - rule = "9"; - if (before == ALetter && after == Numeric) return false; - - // Numeric × ALetter (7) - rule = "10"; - if (before == Numeric && after == ALetter) return false; - - - // Don’t break within sequences like: '-3.2' - // Numeric (MidNum | MidNumLet)×Numeric(11) - rule = "11"; - if (before2 == Numeric && (before == Infix_Numeric || before == MidNumLet) && after == Numeric) return false; - - // Numeric×(MidNum | MidNumLet) Numeric(12) - rule = "12"; - if (before == Numeric && (after == Infix_Numeric || after == MidNumLet) && after2 == Numeric) return false; - - // Don't break between Hiragana - - // Hiragana × Hiragana (13) - rule = "13"; - if (before == Hiragana && after == Hiragana) return false; - - // Otherwise break always. - rule = "14"; - return true; - - } - - } - - //============================================== - static class GenerateSentenceBreakTest extends GenerateBreakTest { + static final UnicodeMap map = new UnicodeMap(); + static final int + Sep = map.add("Sep", new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]")), + Format = map.add("Format", getSet(CATEGORY, Cf)), + Sp = map.add("Sp", getSet(BINARY_PROPERTIES, White_space) + .removeAll(map.getSetFromIndex(Sep))), + Lower = map.add("Lower", getSet(DERIVED, PropLowercase)), + Upper = map.add("Upper", getSet(CATEGORY, Lt) + .addAll(getSet(DERIVED, PropUppercase))), + OLetter = map.add("OLetter", + getSet(DERIVED, PropAlphabetic) + .add(0x05F3, 0x05F3) + .removeAll(map.getSetFromIndex(Lower)) + .removeAll(map.getSetFromIndex(Upper)) + ), + Numeric = map.add("Numeric", getSet(LINE_BREAK, LB_NU)), + ATerm = map.add("ATerm", new UnicodeSet(0x002E,0x002E)), + Term = map.add("Term", new UnicodeSet( + "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367" + + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049" + + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]")), + Close = map.add("Close", + getSet(CATEGORY, Po) + .addAll(getSet(CATEGORY, Pe)) + .addAll(getSet(LINE_BREAK, LB_QU)) + .removeAll(map.getSetFromIndex(ATerm)) + .removeAll(map.getSetFromIndex(Term)) + .remove(0x05F3) + ), + Other = map.add("Other", new UnicodeSet(0,0x10FFFF), false, false); + + { + fileName = "GraphemeCluster"; + sampleMap = map; + } + + // stuff that subclasses need to override + public String getTypeID(int cp, boolean recommended) { + return map.getLabel(cp); + } + + // stuff that subclasses need to override + public byte getType(int cp, boolean recommended) { + return (byte) map.getIndex(cp); + } + + + /* static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5, Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10, LIMIT = Other + 1; @@ -1470,7 +1393,6 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric", "Close", "ATerm", "Term", "Other" }; - static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest(); static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]"); static UnicodeSet atermSet = new UnicodeSet("[\\u002E]"); @@ -1484,7 +1406,9 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase); UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet(); + */ + static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest(); { fileName = "Sentence"; @@ -1515,7 +1439,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT extraSingleSamples = temp; } - + /* // stuff that subclasses need to override public String getTypeID(int cp, boolean recommended) { byte type = getType(cp, recommended); @@ -1539,13 +1463,14 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT || Default.ucd.getLineBreak(cp) == LB_QU) return Close; return Other; } - + */ + public int genTestItems(String before, String after, String[] results) { results[0] = before + after; /* results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; - results[3] = 'a' + before + "\u0301\u0308" + samples[Infix_Numeric] + after + "\u0301\u0308" + 'a'; + results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; */ return 1; } @@ -1553,17 +1478,17 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT static Context context = new Context(); public boolean isBreak(String source, int offset, boolean recommended) { - - rule = "1"; + + // Break at the start and end of text. + rule = "1: sot ÷"; if (offset < 0 || offset > source.length()) return false; if (offset == 0) return true; - rule = "2"; + rule = "2: ÷ eot"; if (offset == source.length()) return true; - // Sep ÷ (3) - rule = "3"; + rule = "3: Sep ÷"; byte beforeChar = getResolvedType(source.charAt(offset-1), recommended); if (beforeChar == Sep) return true; @@ -1576,7 +1501,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT // X // (5) - rule="3"; + rule="4: GC -> FB; 5: X Format* -> X"; if (!grapheme.isBreak( source, offset, recommended)) return false; getGraphemeBases(source, offset, recommended, Format, context); @@ -1589,13 +1514,10 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT // Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence. - // ATerm × (Lower | Numeric) (6) - // Upper ATerm × Upper (7) - if (before == ATerm) { - rule = "6"; + rule = "6: ATerm × ( Numeric | Lower )"; if (after == Lower || after == Numeric) return false; - rule = "7"; + rule = "7: Upper ATerm × Upper"; if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper); if (before2 == Upper && after == Upper) return false; } @@ -1608,7 +1530,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT graphemeIterator.set(source, offset); int state = 0; - byte lookAfter = -1; + int lookAfter = -1; int cp; byte t; boolean gotSpace = false; @@ -1655,7 +1577,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT if (lookAfter == -1) { // Otherwise, do not break // Any × Any (11) - rule = "12"; + rule = "12: Any × Any"; return false; } @@ -1682,16 +1604,16 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT if (isFirst) { isFirst = false; if (lookAfter == ATerm && t == Upper) { - rule = "8"; + rule = "8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"; return false; } if (gotSpace) { if (t == Sp || t == Sep) { - rule = "10"; + rule = "10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"; return false; } } else if (t == Close || t == Sp || t == Sep) { - rule = "9"; + rule = "9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"; return false; } if (lookAfter == Term) break; @@ -1700,12 +1622,12 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT // at this point, we have an ATerm. All other conditions are ok, but we need to verify 6 if (t != OLetter && t != Upper && t != Lower) continue; if (t == Lower) { - rule = "8"; + rule = "8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"; return false; } break; } - rule = "11"; + rule = "11: ( Term | ATerm ) Close* Sp* ÷"; return true; } } diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 22182db08f8..c2c2f346c1b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2003/03/20 21:47:26 $ -* $Revision: 1.21 $ +* $Date: 2003/04/01 02:52:00 $ +* $Revision: 1.22 $ * ******************************************************************************* */ @@ -190,7 +190,7 @@ public interface UCD_Types { Math_Property = 10, Hex_Digit = 11, ASCII_Hex_Digit = 12, - Alphabetic = 13, + Other_Alphabetic = 13, Ideographic = 14, Diacritic = 15, Extender = 16, @@ -215,7 +215,7 @@ public interface UCD_Types { static final int BidiMirroredMask = 1<= 0; --i) { + if (sets[i].contains(codepoint)) return i; + } + return -1; + } + + public int getIndexFromLabel(String label) { + for (int i = count - 1; i >= 0; --i) { + if (labels[i].equalsIgnoreCase(label)) return i; + } + return -1; + } + + public String getLabel(int codepoint) { + return getLabelFromIndex(getIndex(codepoint)); + } + + public String getLabelFromIndex(int index) { + if (index < 0 || index >= count) return null; + return labels[index]; + } + + public UnicodeSet getSetFromIndex(int index) { + if (index < 0 || index >= count) return null; + return new UnicodeSet(sets[index]); // protect from changes + } + + public int size() { + return count; + } +} diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 21d3b589d8a..e28afe26c39 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2003/03/19 17:30:56 $ -* $Revision: 1.30 $ +* $Date: 2003/04/01 02:52:00 $ +* $Revision: 1.31 $ * ******************************************************************************* */ @@ -1071,6 +1071,15 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES static PrintWriter showSetNamesPw; + public static void showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2, boolean separateLines, UCD ucd) { + UnicodeSet temp = new UnicodeSet(set1).removeAll(set2); + showSetNames("In " + name1 + ", but not " + name2, temp, separateLines, false, false, ucd); + temp = new UnicodeSet(set2).removeAll(set1); + showSetNames("In " + name2 + ", but not " + name1, temp, separateLines, false, false, ucd); + temp = new UnicodeSet(set2).retainAll(set1); + showSetNames("In " + name1 + " and " + name2, temp, separateLines, false, false, ucd); + } + public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) { showSetNames(prefix, set, separateLines, false, false, ucd); }