diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java index 521364791ee..ce18cf40ca9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java +++ b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $ -* $Date: 2002/08/08 15:35:01 $ -* $Revision: 1.1 $ +* $Date: 2002/08/09 23:56:24 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -41,7 +41,7 @@ abstract public class CheckCollator { // later, drive off of args // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai - test(Locale.KOREAN, "Korean"); + //test(Locale.KOREAN, "Korean"); test(Locale.ENGLISH, "Latin"); test(Locale.FRENCH, "Latin"); test(Locale.JAPANESE, "Japanese"); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java index a7f1b65ed1b..d2187473638 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ -* $Date: 2002/08/08 15:38:15 $ -* $Revision: 1.1 $ +* $Date: 2002/08/09 23:56:24 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -59,58 +59,53 @@ abstract public class GenerateBreakTest implements UCD_Types { return UTF16.charAt(source, start); } + // quick & dirty routine + String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) { + String result = insertion; + for (int i = 0; i < source.length(); ++i) { + result += source.charAt(i); + if (breaker.isBreak(source, i, true)) { + result += insertion; + } + } + return result + insertion; + } + static UnicodeSet midLetterSet = new UnicodeSet("[\u0027\u002E\u003A\u00AD\u05F3\u05F4\u2019\uFE52\uFE55\uFF07\uFF0E\uFF1A]"); - /* - U+0027 APOSTROPHE - U+002E FULL STOP - U+003A COLON # used in Swedish - U+00AD SOFT HYPHEN - U+05F3 HEBREW PUNCTUATION GERESH - U+05F4 HEBREW PUNCTUATION GERSHAYIM - U+2019 RIGHT SINGLE QUOTATION MARK - U+FE52 SMALL FULL STOP - U+FE55 SMALL COLON - U+FF07 FULLWIDTH APOSTROPHE - U+FF0E FULLWIDTH FULL STOP - U+FF1A FULLWIDTH COLON - */ static UnicodeSet ambigSentPunct = new UnicodeSet("[\u002E\u0589\u06D4]"); - /* - U+002E FULL STOP - U+0589 ARMENIAN FULL STOP - U+06D4 ARABIC FULL STOP - */ static UnicodeSet sentPunct = new UnicodeSet("[\u0021\u003F\u0387\u061F\u0964\u203C\u203D\u2048\u2049" + "\u3002\ufe52\ufe57\uff01\uff0e\uff1f\uff61]"); - /* - U+0021 EXCLAMATION MARK - U+003F QUESTION MARK - U+0387 GREEK ANO TELEIA - U+061F ARABIC QUESTION MARK - U+0964 DEVANAGARI DANDA - U+203C DOUBLE EXCLAMATION MARK - U+203D INTERROBANG - U+2048 QUESTION EXCLAMATION MARK - U+2049 EXCLAMATION QUESTION MARK - U+3002 IDEOGRAPHIC FULL STOP - U+FE52 SMALL FULL STOP - U+FE57 SMALL EXCLAMATION MARK - U+FF01 FULLWIDTH EXCLAMATION MARK - U+FF0E FULLWIDTH FULL STOP - U+FF1F FULLWIDTH QUESTION MARK - U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP - */ + + static { + Default.setUCD(); + } + + static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]"); + static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet() + .addAll(extraAlpha); + + static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet(); + + static { + System.out.println("alphabetic: " + alphabeticSet.toPattern(true)); + } + + + // ====================== Main =========================== + + static final boolean SHOW_TYPE = false; + public static void main(String[] args) throws IOException { System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); Default.setUCD(); - - checkDecomps(); - + if (DEBUG) { + checkDecomps(); + Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, Default.ucd); System.out.println("*** Extend - Cf"); @@ -122,7 +117,20 @@ abstract public class GenerateBreakTest implements UCD_Types { gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false); systemPrintWriter.flush(); } + + if (false) { + GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest(); + foo.isBreak("(\"Go.\") (He did)", 5, true); + + showSet("sepSet", GenerateSentenceBreakTest.sepSet); + showSet("atermSet", GenerateSentenceBreakTest.atermSet); + showSet("termSet", GenerateSentenceBreakTest.termSet); + } + new GenerateSentenceBreakTest().run(); + + //if (true) return; // cut short for now + new GenerateLineBreakTest().run(); new GenerateGraphemeBreakTest().run(); new GenerateWordBreakTest().run(); @@ -178,6 +186,13 @@ abstract public class GenerateBreakTest implements UCD_Types { return result.toString(); } + static void showSet(String title, UnicodeSet set) { + System.out.println(title + ": " + set.toPattern(true)); + Utility.showSetNames("", set, false, Default.ucd); + } + + + // determines if string is of form Base NSM* static boolean isBaseNSMStar(String source) { int cp; @@ -262,62 +277,74 @@ abstract public class GenerateBreakTest implements UCD_Types { PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS); out.println("