From 7ae37ad56d0a6248971b8566e206795d4eafe9a3 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Wed, 26 Apr 2000 17:13:17 +0000 Subject: [PATCH] Import Mark Davis' round trip test - currently fails! X-SVN-Rev: 1260 --- .../icu/dev/test/translit/RoundTripTest.java | 370 +++++++++++++++++ .../icu/dev/test/translit/TestUtility.java | 379 ++++++++++++++++++ .../com/ibm/test/translit/RoundTripTest.java | 370 +++++++++++++++++ .../com/ibm/test/translit/TestUtility.java | 379 ++++++++++++++++++ 4 files changed, 1498 insertions(+) create mode 100755 icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java create mode 100755 icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java create mode 100755 icu4j/src/com/ibm/test/translit/RoundTripTest.java create mode 100755 icu4j/src/com/ibm/test/translit/TestUtility.java diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java new file mode 100755 index 00000000000..96734e7e6c8 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java @@ -0,0 +1,370 @@ +package com.ibm.test.translit; +import com.ibm.test.*; +import com.ibm.text.*; +import java.io.*; +import java.text.ParseException; + +/** + * @test + * @summary Round trip test of Transliterator + */ +public class RoundTripTest extends TestFmwk { + + public static void main(String[] args) throws Exception { + new RoundTripTest().run(args); + } + + public void TestRoundTrip() throws IOException, ParseException { + Test t; + + // Test Hiragana + new Test("Latin-Kana", + TestUtility.LATIN_SCRIPT, TestUtility.HIRAGANA_SCRIPT) + .test("[a-z]", "[\u3040-\u3094]", this); + + // Test Katakana + new Test("Latin-Kana", + TestUtility.LATIN_SCRIPT, TestUtility.KATAKANA_SCRIPT) + .test("[A-Z]", "[\u30A1-\u30FA]", this); + + // Test Arabic + new Test("Latin-Arabic", + TestUtility.LATIN_SCRIPT, TestUtility.ARABIC_SCRIPT) + .test(null, "[\u0620-\u065F-[\u0640]]", this); + + // Test Hebrew + new 
Test("Latin-Hebrew", + TestUtility.LATIN_SCRIPT, TestUtility.HEBREW_SCRIPT) + .test(null, "[\u05D0-\u05EF]", this); + + // Test Hangul + t = new TestHangul(); + t.setPairLimit(30); // Don't run full test -- too long + t.test(null, null, this); + + // Test Jamo + t = new Test("Latin-Jamo", + TestUtility.LATIN_SCRIPT, TestUtility.JAMO_SCRIPT); + t.setErrorLimit(100); + t.test(null, null, this); + + // Test JamoHangul + t = new Test("Latin-Jamo;Jamo-Hangul", + TestUtility.LATIN_SCRIPT, TestUtility.HANGUL_SCRIPT); + t.setErrorLimit(100); + t.test(null, null, this); + + // Test Greek + new Test("Latin-Greek", + TestUtility.LATIN_SCRIPT, TestUtility.GREEK_SCRIPT) + .test(null, "[\u0380-\u03CF]", this); + + // Test Cyrillic + new Test("Latin-Cyrillic", + TestUtility.LATIN_SCRIPT, TestUtility.CYRILLIC_SCRIPT) + .test(null, "[\u0401\u0410-\u0451]", this); + + // Test Utility + // TestUtility.test(); // dump blocks and scripts for debugging + } + + static class Test { + + PrintWriter out; + + private String transliteratorID; + private byte sourceScript; + private byte targetScript; + private boolean showProgress = true; + private boolean showSuccess = false; + private int errorLimit = Integer.MAX_VALUE; + private int errorCount = 0; + private int pairLimit = 0x10000; + UnicodeSet sourceRange; + UnicodeSet targetRange; + TestLog log; + + /* + * create a test for the given script transliterator. 
+ */ + Test(String transliteratorID, + byte sourceScript, byte targetScript) { + this.transliteratorID = transliteratorID; + this.sourceScript = sourceScript; + this.targetScript = targetScript; + } + + public void setErrorLimit(int limit) { + errorLimit = limit; + } + + public void setPairLimit(int limit) { + pairLimit = limit; + } + + public void test(String sourceRange, String targetRange, TestLog log) + throws java.io.IOException, java.text.ParseException { + + if (sourceRange != null && sourceRange.length() > 0) { + this.sourceRange = new UnicodeSet(sourceRange); + } + if (targetRange != null && targetRange.length() > 0) { + this.targetRange = new UnicodeSet(targetRange); + } + + if (this.sourceRange == null) this.sourceRange = new UnicodeSet("[a-Z]"); + + this.log = log; + + // make a UTF-8 output file we can read with a browser + + // note: check that every transliterator transliterates the null string correctly! + + String logFileName = "test_" + transliteratorID + "_" + + sourceScript + "_" + targetScript + ".html"; + + log.logln("Creating log file " + logFileName); + + out = new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(logFileName), + "UTF8"), + 4*1024)); + //out.write('\uFFEF'); // BOM + out.println(""); + out.println(""); + out.println(""); + out.println(""); + out.println(""); + test2(); + out.println("
"); + out.close(); + + if (errorCount > 0) { + log.errln(transliteratorID + " errors: " + errorCount); + } else { + log.logln(transliteratorID + " ok"); + new File(logFileName).delete(); + } + } + + public void test2() { + int count = 0; + + Transliterator sourceToTarget = Transliterator.getInstance(transliteratorID); + Transliterator targetToSource = sourceToTarget.getInverse(); + + log.logln("Checking that all source characters convert to target - Singles"); + + // check single letters + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isSource(c)) continue; + //if (showProgress && (count++ % 100) == 0) { + // log.logln(count + ": " + TestUtility.hex(c)); + //} + String cs = String.valueOf(c); + String targ = sourceToTarget.transliterate(String.valueOf(cs)); + if (!isReceivingTarget(targ)) { + out.println("Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(c + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + } + } + + log.logln("Checking that all source characters convert to target - Doubles"); + count = 0; + + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isSource(c)) continue; + for (char d = 0; d < 0xFFFF; ++d) { + if (Character.getType(d) == Character.UNASSIGNED) continue; + if (!isSource(d)) continue; + String cs = String.valueOf(c) + d; + //if (showProgress && (count++ % 1000) == 0) { + // log.logln(count + ": " + TestUtility.hex(cs)); + //} + String targ = sourceToTarget.transliterate(cs); + if (!isReceivingTarget(targ)) { + out.println("Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(c + "(" + 
TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + } + } + } + + log.logln("Checking that target characters convert to source and back - Singles"); + count = 0; + + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isTarget(c)) continue; + //if (showProgress && (count++ % 100) == 0) { + // log.logln(count + ": " + TestUtility.hex(c)); + //} + String cs = String.valueOf(c); + if (c > 0x0400) { + cs = cs + ""; + } + String targ = targetToSource.transliterate(cs); + String reverse = sourceToTarget.transliterate(targ); + if (!isReceivingSource(targ)) { + out.println("Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" ); + if (++errorCount > errorLimit) return; + } else if (!cs.equals(reverse)) { + out.println("Fail Roundtrip:" + + cs + "(" + + TestUtility.hex(cs) + ") =>" + + targ + "(" + + TestUtility.hex(targ) + ") =>" + + reverse + "(" + + TestUtility.hex(reverse) + ")" ); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" + + " => " + reverse + "(" + TestUtility.hex(reverse) + ")" ); + } + } + + log.logln("Checking that target characters convert to source and back - Doubles"); + count = 0; + + StringBuffer buf = new StringBuffer("aa"); + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isTarget(c)) continue; + if (++count > pairLimit) { + out.println("Test truncated at " + pairLimit + " x 64k pairs"); + break; + } + buf.setCharAt(0, c); + if (showProgress) { // && (count++ % 10000) == 0) { + log.log(TestUtility.hex(c)); + // count + ": " + TestUtility.hex(cs)); + } + for (char d = 0; d < 0xFFFF; ++d) { + if (Character.getType(d) == Character.UNASSIGNED) continue; + if (!isTarget(d)) continue; + buf.setCharAt(1, d); + String cs = 
buf.toString(); + String targ = targetToSource.transliterate(cs); + String reverse = sourceToTarget.transliterate(targ); + if (!isReceivingSource(targ)) { + out.println("Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" ); + if (++errorCount > errorLimit) return; + } else if (!cs.equals(reverse)) { + out.println("Fail Roundtrip:" + + cs + "(" + + TestUtility.hex(cs) + ") =>" + + targ + "(" + + TestUtility.hex(targ) + ") =>" + + reverse + "(" + + TestUtility.hex(reverse) + ")" ); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" + + " => " + reverse + "(" + TestUtility.hex(reverse) + ")" ); + } + } + } + if (showProgress) log.logln(""); + } + + /* + * Characters to filter for source-target mapping completeness + * Typically is base alphabet, minus extended characters + * Default is ASCII letters for Latin + */ + public boolean isSource(char c) { + byte script = TestUtility.getScript(c); + if (script != sourceScript) return false; + if (!Character.isLetter(c)) return false; + if (!sourceRange.contains(c)) return false; + return true; + } + + /* + * Characters to check for target back to source mapping. 
+ * Typically the same as the target script, plus punctuation + */ + public boolean isReceivingSource(char c) { + byte script = TestUtility.getScript(c); + return (script == sourceScript || script == TestUtility.COMMON_SCRIPT); + } + + /* + * Characters to filter for target-source mapping + * Typically is base alphabet, minus extended characters + */ + public boolean isTarget(char c) { + byte script = TestUtility.getScript(c); + if (script != targetScript) return false; + if (!Character.isLetter(c)) return false; + if (targetRange != null && !targetRange.contains(c)) return false; + return true; + } + + /* + * Characters to check for target-source mapping + * Typically the same as the source script, plus punctuation + */ + public boolean isReceivingTarget(char c) { + byte script = TestUtility.getScript(c); + return (script == targetScript || script == TestUtility.COMMON_SCRIPT); + } + + + final boolean isSource(String s) { + for (int i = 0; i < s.length(); ++i) { + if (!isSource(s.charAt(i))) return false; + } + return true; + } + + final boolean isTarget(String s) { + for (int i = 0; i < s.length(); ++i) { + if (!isTarget(s.charAt(i))) return false; + } + return true; + } + + final boolean isReceivingSource(String s) { + for (int i = 0; i < s.length(); ++i) { + if (!isReceivingSource(s.charAt(i))) return false; + } + return true; + } + + final boolean isReceivingTarget(String s) { + for (int i = 0; i < s.length(); ++i) { + if (!isReceivingTarget(s.charAt(i))) return false; + } + return true; + } + } + + static class TestHangul extends Test { + TestHangul () { + super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT); + } + + public boolean isSource(char c) { + if (0x1113 <= c && c <= 0x1160) return false; + if (0x1176 <= c && c <= 0x11F9) return false; + if (0x3131 <= c && c <= 0x318E) return false; + return super.isSource(c); + } + } +} diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java 
/*
 * Patch context preserved from the whitespace-collapsed patch text that
 * surrounded this class:
 *
 *   (diff --git a/...) b/icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java
 *   new file mode 100755
 *   index 00000000000..97e5b5c53f4
 *   --- /dev/null
 *   +++ b/icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java
 *   @@ -0,0 +1,379 @@
 *   +package com.ibm.test.translit;
 */

/**
 * Table-driven lookup from a 16-bit char to a block code and from a block
 * code to a script code, plus small hex-formatting helpers used by the
 * transliterator tests.
 */
public final class TestUtility {

    /** Returns the script code of the block that contains {@code c}. */
    public static byte getScript(char c) {
        return getScript(getBlock(c));
    }

    /**
     * Returns the script code for a block code.
     *
     * @param block index into {@code blockToScript}; an index outside the
     *              table raises ArrayIndexOutOfBoundsException
     */
    public static byte getScript(byte block) {
        return blockToScript[block];
    }

    /**
     * Returns the block code of {@code c}.
     * {@code charToBlock} has one entry per 128 code units. A negative entry
     * -k refers to {@code split[k-1]}, which divides that 128-char page at a
     * boundary; a split may chain to a further split.
     */
    public static byte getBlock(char c) {
        int index = c >> 7;
        byte block = charToBlock[index];
        while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
            int[] tuple = split[-block - 1];
            if (c < tuple[0]) {
                block = (byte) tuple[1];
            } else {
                block = (byte) tuple[2];
            }
        }
        return block;
    }

    /**
     * Returns the next letter after {@code c} that belongs to {@code script},
     * or 0xFFFF if done.
     */
    public static char getNextLetter(char c, byte script) {
        while (c < 0xFFFF) {
            ++c;
            if (getScript(c) == script && Character.isLetter(c)) {
                return c;
            }
        }
        return c;
    }

    /** Formats {@code ch} as four upper-case hex digits. */
    public static String hex(char ch) {
        String digits = Integer.toString(ch, 16).toUpperCase();
        return "0000".substring(0, 4 - digits.length()) + digits;
    }

    /** Formats every char of {@code s} with hex(char), separated by commas. */
    public static String hex(String s) {
        return hex(s, ",");
    }

    /**
     * Formats every char of {@code s} with hex(char), joined with
     * {@code sep}. An empty string yields an empty string.
     */
    public static String hex(String s, String sep) {
        if (s.length() == 0) {
            return "";
        }
        // StringBuffer instead of repeated String concatenation, which
        // re-copied the accumulated result on every iteration.
        StringBuffer result = new StringBuffer();
        result.append(hex(s.charAt(0)));
        for (int i = 1; i < s.length(); ++i) {
            result.append(sep);
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Debugging aid: prints to System.out every code unit at which the block
     * code changes, then every code unit at which the script code changes.
     */
    public static void test() {
        System.out.println("Blocks: ");
        byte lastblock = -128;
        for (char cc = 0; cc < 0xFFFF; ++cc) {
            byte block = TestUtility.getBlock(cc);
            if (block != lastblock) {
                System.out.println(TestUtility.hex(cc) + "\t" + block);
                lastblock = block;
            }
        }
        System.out.println();
        System.out.println("Scripts: ");
        byte lastScript = -128;
        for (char cc = 0; cc < 0xFFFF; ++cc) {
            byte script = TestUtility.getScript(cc);
            if (script != lastScript) {
                System.out.println(TestUtility.hex(cc) + "\t" + script);
                lastScript = script;
            }
        }
    }



    public static final byte // SCRIPT CODE
        COMMON_SCRIPT = 0,
        LATIN_SCRIPT = 1,
        GREEK_SCRIPT = 2,
        CYRILLIC_SCRIPT = 3,
        ARMENIAN_SCRIPT = 4,
        HEBREW_SCRIPT = 5,
        ARABIC_SCRIPT = 6,
        SYRIAC_SCRIPT = 7,
        THAANA_SCRIPT = 8,
        DEVANAGARI_SCRIPT = 9,
        BENGALI_SCRIPT = 10,
        GURMUKHI_SCRIPT = 11,
        GUJARATI_SCRIPT = 12,
        ORIYA_SCRIPT = 13,
        TAMIL_SCRIPT = 14,
        TELUGU_SCRIPT = 15,
        KANNADA_SCRIPT = 16,
        MALAYALAM_SCRIPT = 17,
        SINHALA_SCRIPT = 18,
        THAI_SCRIPT = 19,
        LAO_SCRIPT = 20,
        TIBETAN_SCRIPT = 21,
        MYANMAR_SCRIPT = 22,
        GEORGIAN_SCRIPT = 23,
        JAMO_SCRIPT = 24,
        HANGUL_SCRIPT = 25,
        ETHIOPIC_SCRIPT = 26,
        CHEROKEE_SCRIPT = 27,
        ABORIGINAL_SCRIPT = 28,
        OGHAM_SCRIPT = 29,
        RUNIC_SCRIPT = 30,
        KHMER_SCRIPT = 31,
        MONGOLIAN_SCRIPT = 32,
        HIRAGANA_SCRIPT = 33,
        KATAKANA_SCRIPT = 34,
        BOPOMOFO_SCRIPT = 35,
        HAN_SCRIPT = 36,
        YI_SCRIPT = 37;

    public static final byte // block code
        RESERVED_BLOCK = 0,
        BASIC_LATIN = 1,
        LATIN_1_SUPPLEMENT = 2,
        LATIN_EXTENDED_A = 3,
        LATIN_EXTENDED_B = 4,
        IPA_EXTENSIONS = 5,
        SPACING_MODIFIER_LETTERS = 6,
        COMBINING_DIACRITICAL_MARKS = 7,
        GREEK = 8,
        CYRILLIC = 9,
        ARMENIAN = 10,
        HEBREW = 11,
        ARABIC = 12,
        SYRIAC = 13,
        THAANA = 14,
        DEVANAGARI = 15,
        BENGALI = 16,
        GURMUKHI = 17,
        GUJARATI = 18,
        ORIYA = 19,
        TAMIL = 20,
        TELUGU = 21,
        KANNADA = 22,
        MALAYALAM = 23,
        SINHALA = 24,
        THAI = 25,
        LAO = 26,
        TIBETAN = 27,
        MYANMAR = 28,
        GEORGIAN = 29,
        HANGUL_JAMO = 30,
        ETHIOPIC = 31,
        CHEROKEE = 32,
        UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
        OGHAM = 34,
        RUNIC = 35,
        KHMER = 36,
        MONGOLIAN = 37,
        LATIN_EXTENDED_ADDITIONAL = 38,
        GREEK_EXTENDED = 39,
        GENERAL_PUNCTUATION = 40,
        SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
        CURRENCY_SYMBOLS = 42,
        COMBINING_MARKS_FOR_SYMBOLS = 43,
        LETTERLIKE_SYMBOLS = 44,
        NUMBER_FORMS = 45,
        ARROWS = 46,
        MATHEMATICAL_OPERATORS = 47,
        MISCELLANEOUS_TECHNICAL = 48,
        CONTROL_PICTURES = 49,
        OPTICAL_CHARACTER_RECOGNITION = 50,
        ENCLOSED_ALPHANUMERICS = 51,
        BOX_DRAWING = 52,
        BLOCK_ELEMENTS = 53,
        GEOMETRIC_SHAPES = 54,
        MISCELLANEOUS_SYMBOLS = 55,
        DINGBATS = 56,
        BRAILLE_PATTERNS = 57,
        CJK_RADICALS_SUPPLEMENT = 58,
        KANGXI_RADICALS = 59,
        IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
        CJK_SYMBOLS_AND_PUNCTUATION = 61,
        HIRAGANA = 62,
        KATAKANA = 63,
        BOPOMOFO = 64,
        HANGUL_COMPATIBILITY_JAMO = 65,
        KANBUN = 66,
        BOPOMOFO_EXTENDED = 67,
        ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
        CJK_COMPATIBILITY = 69,
        CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
        CJK_UNIFIED_IDEOGRAPHS = 71,
        YI_SYLLABLES = 72,
        YI_RADICALS = 73,
        HANGUL_SYLLABLES = 74,
        HIGH_SURROGATES = 75,
        HIGH_PRIVATE_USE_SURROGATES = 76,
        LOW_SURROGATES = 77,
        PRIVATE_USE = 78,
        CJK_COMPATIBILITY_IDEOGRAPHS = 79,
        ALPHABETIC_PRESENTATION_FORMS = 80,
        ARABIC_PRESENTATION_FORMS_A = 81,
        COMBINING_HALF_MARKS = 82,
        CJK_COMPATIBILITY_FORMS = 83,
        SMALL_FORM_VARIANTS = 84,
        ARABIC_PRESENTATION_FORMS_B = 85,
        SPECIALS = 86,
        HALFWIDTH_AND_FULLWIDTH_FORMS = 87;

    // Script code for each block code, indexed by block code. Entry 88 has
    // no named block constant; it is reached only through the split table.
    static final byte[] blockToScript = {
        COMMON_SCRIPT, // 0,
        LATIN_SCRIPT, // 1, BASIC_LATIN
        LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
        LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
        LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
        LATIN_SCRIPT, // 5, IPA_EXTENSIONS
        COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
        COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
        GREEK_SCRIPT, // 8, GREEK
        CYRILLIC_SCRIPT, // 9, CYRILLIC
        ARMENIAN_SCRIPT, // 10, ARMENIAN
        HEBREW_SCRIPT, // 11, HEBREW
        ARABIC_SCRIPT, // 12, ARABIC
        SYRIAC_SCRIPT, // 13, SYRIAC
        THAANA_SCRIPT, // 14, THAANA
        DEVANAGARI_SCRIPT, // 15, DEVANAGARI
        BENGALI_SCRIPT, // 16, BENGALI
        GURMUKHI_SCRIPT, // 17, GURMUKHI
        GUJARATI_SCRIPT, // 18, GUJARATI
        ORIYA_SCRIPT, // 19, ORIYA
        TAMIL_SCRIPT, // 20, TAMIL
        TELUGU_SCRIPT, // 21, TELUGU
        KANNADA_SCRIPT, // 22, KANNADA
        MALAYALAM_SCRIPT, // 23, MALAYALAM
        SINHALA_SCRIPT, // 24, SINHALA
        THAI_SCRIPT, // 25, THAI
        LAO_SCRIPT, // 26, LAO
        TIBETAN_SCRIPT, // 27, TIBETAN
        MYANMAR_SCRIPT, // 28, MYANMAR
        GEORGIAN_SCRIPT, // 29, GEORGIAN
        JAMO_SCRIPT, // 30, HANGUL_JAMO
        ETHIOPIC_SCRIPT, // 31, ETHIOPIC
        CHEROKEE_SCRIPT, // 32, CHEROKEE
        ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
        OGHAM_SCRIPT, // 34, OGHAM
        RUNIC_SCRIPT, // 35, RUNIC
        KHMER_SCRIPT, // 36, KHMER
        MONGOLIAN_SCRIPT, // 37, MONGOLIAN
        LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
        GREEK_SCRIPT, // 39, GREEK_EXTENDED
        COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
        COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
        COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
        COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
        COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
        COMMON_SCRIPT, // 45, NUMBER_FORMS
        COMMON_SCRIPT, // 46, ARROWS
        COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
        COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
        COMMON_SCRIPT, // 49, CONTROL_PICTURES
        COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
        COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
        COMMON_SCRIPT, // 52, BOX_DRAWING
        COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
        COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
        COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
        COMMON_SCRIPT, // 56, DINGBATS
        COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
        HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
        HAN_SCRIPT, // 59, KANGXI_RADICALS
        HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
        COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
        HIRAGANA_SCRIPT, // 62, HIRAGANA
        KATAKANA_SCRIPT, // 63, KATAKANA
        BOPOMOFO_SCRIPT, // 64, BOPOMOFO
        JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
        HAN_SCRIPT, // 66, KANBUN
        BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
        COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
        COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
        HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
        HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
        YI_SCRIPT, // 72, YI_SYLLABLES
        YI_SCRIPT, // 73, YI_RADICALS
        HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
        COMMON_SCRIPT, // 75, HIGH_SURROGATES
        COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
        COMMON_SCRIPT, // 77, LOW_SURROGATES
        COMMON_SCRIPT, // 78, PRIVATE_USE
        HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
        COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
        ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
        COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
        COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
        COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
        ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
        COMMON_SCRIPT, // 86, SPECIALS
        COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
        COMMON_SCRIPT, // 88, SPECIALS
    };

    // Each row is {boundary, blockBelow, blockAtOrAbove}; a negative block
    // value chains to another row (see getBlock). The trailing comment is
    // the charToBlock / chain value that selects the row.
    // could be further reduced to a byte array, but I didn't bother.
    static final int[][] split = {
        {0x0250, 4, 5}, // -1
        {0x02B0, 5, 6}, // -2
        {0x0370, 7, 8}, // -3
        {0x0530, 0, 10}, // -4
        {0x0590, 10, 11}, // -5
        {0x0750, 13, 0}, // -6
        {0x07C0, 14, 0}, // -7
        {0x10A0, 28, 29}, // -8
        {0x13A0, 0, 32}, // -9
        {0x16A0, 34, 35}, // -10
        {0x18B0, 37, 0}, // -11
        {0x2070, 40, 41}, // -12
        {0x20A0, 41, -31}, // -13
        {0x2150, 44, 45}, // -14
        {0x2190, 45, 46}, // -15
        {0x2440, 49, -32}, // -16
        {0x25A0, 53, 54}, // -17
        {0x27C0, 56, 0}, // -18
        {0x2FE0, 59, -33}, // -19
        {0x3040, 61, 62}, // -20
        {0x30A0, 62, 63}, // -21
        {0x3130, 64, 65}, // -22
        {0x3190, 65, -34}, // -23
        {0x4DB6, 70, 0}, // -24
        {0xA490, 72, -35}, // -25
        {0xD7A4, 74, 0}, // -26
        {0xFB50, 80, 81}, // -27
        {0xFE20, 0, -36}, // -28
        {0xFEFF, 85, 86}, // -29
        {0xFFF0, 87, -37}, // -30
        {0x20D0, 42, 43}, // -31
        {0x2460, 50, 51}, // -32
        {0x2FF0, 0, 60}, // -33
        {0x31A0, 66, -38}, // -34
        {0xA4D0, 73, 0}, //-35
        {0xFE30, 82, -39}, //-36
        {0xFFFE, 88, 0}, //-37
        {0x31C0, 67, 0}, // -38
        {0xFE50, 83, -40}, //-39
        {0xFE70, 84, 85} // -40
    };

    // One entry per 128 code units (index = c >> 7), 512 entries in total.
    // Non-negative entries are block codes; negative entries select a row
    // of the split table.
    static final byte[] charToBlock = {
        1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
        0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
        28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
        37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
        -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
        57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
        -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
        75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
    };

}
// Patch text continues: diff --git
a/icu4j/src/com/ibm/test/translit/RoundTripTest.java b/icu4j/src/com/ibm/test/translit/RoundTripTest.java new file mode 100755 index 00000000000..96734e7e6c8 --- /dev/null +++ b/icu4j/src/com/ibm/test/translit/RoundTripTest.java @@ -0,0 +1,370 @@ +package com.ibm.test.translit; +import com.ibm.test.*; +import com.ibm.text.*; +import java.io.*; +import java.text.ParseException; + +/** + * @test + * @summary Round trip test of Transliterator + */ +public class RoundTripTest extends TestFmwk { + + public static void main(String[] args) throws Exception { + new RoundTripTest().run(args); + } + + public void TestRoundTrip() throws IOException, ParseException { + Test t; + + // Test Hiragana + new Test("Latin-Kana", + TestUtility.LATIN_SCRIPT, TestUtility.HIRAGANA_SCRIPT) + .test("[a-z]", "[\u3040-\u3094]", this); + + // Test Katakana + new Test("Latin-Kana", + TestUtility.LATIN_SCRIPT, TestUtility.KATAKANA_SCRIPT) + .test("[A-Z]", "[\u30A1-\u30FA]", this); + + // Test Arabic + new Test("Latin-Arabic", + TestUtility.LATIN_SCRIPT, TestUtility.ARABIC_SCRIPT) + .test(null, "[\u0620-\u065F-[\u0640]]", this); + + // Test Hebrew + new Test("Latin-Hebrew", + TestUtility.LATIN_SCRIPT, TestUtility.HEBREW_SCRIPT) + .test(null, "[\u05D0-\u05EF]", this); + + // Test Hangul + t = new TestHangul(); + t.setPairLimit(30); // Don't run full test -- too long + t.test(null, null, this); + + // Test Jamo + t = new Test("Latin-Jamo", + TestUtility.LATIN_SCRIPT, TestUtility.JAMO_SCRIPT); + t.setErrorLimit(100); + t.test(null, null, this); + + // Test JamoHangul + t = new Test("Latin-Jamo;Jamo-Hangul", + TestUtility.LATIN_SCRIPT, TestUtility.HANGUL_SCRIPT); + t.setErrorLimit(100); + t.test(null, null, this); + + // Test Greek + new Test("Latin-Greek", + TestUtility.LATIN_SCRIPT, TestUtility.GREEK_SCRIPT) + .test(null, "[\u0380-\u03CF]", this); + + // Test Cyrillic + new Test("Latin-Cyrillic", + TestUtility.LATIN_SCRIPT, TestUtility.CYRILLIC_SCRIPT) + .test(null, "[\u0401\u0410-\u0451]", this); 
+ + // Test Utility + // TestUtility.test(); // dump blocks and scripts for debugging + } + + static class Test { + + PrintWriter out; + + private String transliteratorID; + private byte sourceScript; + private byte targetScript; + private boolean showProgress = true; + private boolean showSuccess = false; + private int errorLimit = Integer.MAX_VALUE; + private int errorCount = 0; + private int pairLimit = 0x10000; + UnicodeSet sourceRange; + UnicodeSet targetRange; + TestLog log; + + /* + * create a test for the given script transliterator. + */ + Test(String transliteratorID, + byte sourceScript, byte targetScript) { + this.transliteratorID = transliteratorID; + this.sourceScript = sourceScript; + this.targetScript = targetScript; + } + + public void setErrorLimit(int limit) { + errorLimit = limit; + } + + public void setPairLimit(int limit) { + pairLimit = limit; + } + + public void test(String sourceRange, String targetRange, TestLog log) + throws java.io.IOException, java.text.ParseException { + + if (sourceRange != null && sourceRange.length() > 0) { + this.sourceRange = new UnicodeSet(sourceRange); + } + if (targetRange != null && targetRange.length() > 0) { + this.targetRange = new UnicodeSet(targetRange); + } + + if (this.sourceRange == null) this.sourceRange = new UnicodeSet("[a-Z]"); + + this.log = log; + + // make a UTF-8 output file we can read with a browser + + // note: check that every transliterator transliterates the null string correctly! + + String logFileName = "test_" + transliteratorID + "_" + + sourceScript + "_" + targetScript + ".html"; + + log.logln("Creating log file " + logFileName); + + out = new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(logFileName), + "UTF8"), + 4*1024)); + //out.write('\uFFEF'); // BOM + out.println(""); + out.println(""); + out.println(""); + out.println(""); + out.println(""); + test2(); + out.println("
"); + out.close(); + + if (errorCount > 0) { + log.errln(transliteratorID + " errors: " + errorCount); + } else { + log.logln(transliteratorID + " ok"); + new File(logFileName).delete(); + } + } + + public void test2() { + int count = 0; + + Transliterator sourceToTarget = Transliterator.getInstance(transliteratorID); + Transliterator targetToSource = sourceToTarget.getInverse(); + + log.logln("Checking that all source characters convert to target - Singles"); + + // check single letters + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isSource(c)) continue; + //if (showProgress && (count++ % 100) == 0) { + // log.logln(count + ": " + TestUtility.hex(c)); + //} + String cs = String.valueOf(c); + String targ = sourceToTarget.transliterate(String.valueOf(cs)); + if (!isReceivingTarget(targ)) { + out.println("Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(c + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + } + } + + log.logln("Checking that all source characters convert to target - Doubles"); + count = 0; + + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isSource(c)) continue; + for (char d = 0; d < 0xFFFF; ++d) { + if (Character.getType(d) == Character.UNASSIGNED) continue; + if (!isSource(d)) continue; + String cs = String.valueOf(c) + d; + //if (showProgress && (count++ % 1000) == 0) { + // log.logln(count + ": " + TestUtility.hex(cs)); + //} + String targ = sourceToTarget.transliterate(cs); + if (!isReceivingTarget(targ)) { + out.println("Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(c + "(" + 
TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")"); + } + } + } + + log.logln("Checking that target characters convert to source and back - Singles"); + count = 0; + + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isTarget(c)) continue; + //if (showProgress && (count++ % 100) == 0) { + // log.logln(count + ": " + TestUtility.hex(c)); + //} + String cs = String.valueOf(c); + if (c > 0x0400) { + cs = cs + ""; + } + String targ = targetToSource.transliterate(cs); + String reverse = sourceToTarget.transliterate(targ); + if (!isReceivingSource(targ)) { + out.println("Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" ); + if (++errorCount > errorLimit) return; + } else if (!cs.equals(reverse)) { + out.println("Fail Roundtrip:" + + cs + "(" + + TestUtility.hex(cs) + ") =>" + + targ + "(" + + TestUtility.hex(targ) + ") =>" + + reverse + "(" + + TestUtility.hex(reverse) + ")" ); + if (++errorCount > errorLimit) return; + } else if (showSuccess) { + out.println(cs + "(" + TestUtility.hex(cs) + ")" + + " => " + targ + "(" + TestUtility.hex(targ) + ")" + + " => " + reverse + "(" + TestUtility.hex(reverse) + ")" ); + } + } + + log.logln("Checking that target characters convert to source and back - Doubles"); + count = 0; + + StringBuffer buf = new StringBuffer("aa"); + for (char c = 0; c < 0xFFFF; ++c) { + if (Character.getType(c) == Character.UNASSIGNED) continue; + if (!isTarget(c)) continue; + if (++count > pairLimit) { + out.println("Test truncated at " + pairLimit + " x 64k pairs"); + break; + } + buf.setCharAt(0, c); + if (showProgress) { // && (count++ % 10000) == 0) { + log.log(TestUtility.hex(c)); + // count + ": " + TestUtility.hex(cs)); + } + for (char d = 0; d < 0xFFFF; ++d) { + if (Character.getType(d) == Character.UNASSIGNED) continue; + if (!isTarget(d)) continue; + buf.setCharAt(1, d); + String cs = 
buf.toString();
+                    String targ = targetToSource.transliterate(cs);
+                    String reverse = sourceToTarget.transliterate(targ);
+                    if (!isReceivingSource(targ)) {
+                        out.println("Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")"
+                            + " => " + targ + "(" + TestUtility.hex(targ) + ")" );
+                        if (++errorCount > errorLimit) return;
+                    } else if (!cs.equals(reverse)) {
+                        out.println("Fail Roundtrip:" +
+                            cs + "(" +
+                            TestUtility.hex(cs) + ") =>" +
+                            targ + "(" +
+                            TestUtility.hex(targ) + ") =>" +
+                            reverse + "(" +
+                            TestUtility.hex(reverse) + ")" );
+                        if (++errorCount > errorLimit) return;
+                    } else if (showSuccess) {
+                        out.println(cs + "(" + TestUtility.hex(cs) + ")"
+                            + " => " + targ + "(" + TestUtility.hex(targ) + ")"
+                            + " => " + reverse + "(" + TestUtility.hex(reverse) + ")" );
+                    }
+                }
+            }
+            if (showProgress) log.logln("");
+        }
+
+        /*
+         * Characters to filter for source-target mapping completeness.
+         * Typically is base alphabet, minus extended characters.
+         *
+         * Returns true when c is a letter (Character.isLetter) whose script,
+         * as reported by TestUtility.getScript, equals sourceScript and which
+         * is contained in sourceRange. A null sourceRange places no
+         * restriction on the character, exactly as isTarget treats a null
+         * targetRange.
+         */
+        public boolean isSource(char c) {
+            byte script = TestUtility.getScript(c);
+            if (script != sourceScript) return false;
+            if (!Character.isLetter(c)) return false;
+            // The range is optional: skip the membership test when it is
+            // unset rather than failing with a NullPointerException.
+            if (sourceRange != null && !sourceRange.contains(c)) return false;
+            return true;
+        }
+
+        /*
+         * Characters that are acceptable in the output of the
+         * target-to-source direction: anything whose script is the source
+         * script or TestUtility.COMMON_SCRIPT.
+         */
+        public boolean isReceivingSource(char c) {
+            byte script = TestUtility.getScript(c);
+            return (script == sourceScript || script == TestUtility.COMMON_SCRIPT);
+        }
+
+        /*
+         * Characters to filter for target-source mapping.
+         * Typically is base alphabet, minus extended characters.
+         *
+         * Returns true when c is a letter whose script equals targetScript
+         * and, if targetRange is set, which is contained in targetRange.
+         */
+        public boolean isTarget(char c) {
+            byte script = TestUtility.getScript(c);
+            if (script != targetScript) return false;
+            if (!Character.isLetter(c)) return false;
+            if (targetRange != null && !targetRange.contains(c)) return false;
+            return true;
+        }
+
+        /*
+         * Characters that are acceptable in the output of the
+         * source-to-target direction: anything whose script is the target
+         * script or TestUtility.COMMON_SCRIPT.
+         */
+        public boolean isReceivingTarget(char c) {
+            byte script = TestUtility.getScript(c);
+            return (script == targetScript || script == TestUtility.COMMON_SCRIPT);
+        }
+
+
+        /*
+         * String forms of the four predicates above. Each returns true only
+         * if every char of s satisfies the corresponding char predicate; an
+         * empty string therefore yields true.
+         */
+        final boolean isSource(String s) {
+            for (int i = 0; i < s.length(); ++i) {
+                if (!isSource(s.charAt(i))) return false;
+            }
+            return true;
+        }
+
+        final boolean isTarget(String s) {
+            for (int i = 0; i < s.length(); ++i) {
+                if (!isTarget(s.charAt(i))) return false;
+            }
+            return true;
+        }
+
+        final boolean isReceivingSource(String s) {
+            for (int i = 0; i < s.length(); ++i) {
+                if (!isReceivingSource(s.charAt(i))) return false;
+            }
+            return true;
+        }
+
+        final boolean isReceivingTarget(String s) {
+            for (int i = 0; i < s.length(); ++i) {
+                if (!isReceivingTarget(s.charAt(i))) return false;
+            }
+            return true;
+        }
+    }
+
+    /*
+     * Round-trip test for the "Jamo-Hangul" transliterator, with
+     * JAMO_SCRIPT as the source script and HANGUL_SCRIPT as the target.
+     */
+    static class TestHangul extends Test {
+        TestHangul () {
+            super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);
+        }
+
+        /*
+         * Narrows the inherited source filter by rejecting three code point
+         * ranges before delegating to Test.isSource.
+         * NOTE(review): these look like the jamo that cannot take part in
+         * composing a Hangul syllable (old-style jamo and the compatibility
+         * jamo block) -- confirm against the Jamo-Hangul rules.
+         */
+        public boolean isSource(char c) {
+            if (0x1113 <= c && c <= 0x1160) return false;
+            if (0x1176 <= c && c <= 0x11F9) return false;
+            if (0x3131 <= c && c <= 0x318E) return false;
+            return super.isSource(c);
+        }
+    }
+}
diff --git a/icu4j/src/com/ibm/test/translit/TestUtility.java
b/icu4j/src/com/ibm/test/translit/TestUtility.java
new file mode 100755
index 00000000000..97e5b5c53f4
--- /dev/null
+++ b/icu4j/src/com/ibm/test/translit/TestUtility.java
@@ -0,0 +1,379 @@
+package com.ibm.test.translit;
+
+/**
+ * Table-driven lookup of the block and script of a 16-bit character, plus
+ * small hex-formatting helpers used in test output.
+ *
+ * A character is first mapped to a block code through charToBlock, which has
+ * one entry per page of 128 characters. Pages that contain more than one
+ * block hold a negative entry that redirects into the split table. The block
+ * code is then mapped to a script code through blockToScript.
+ */
+public final class TestUtility {
+
+    /**
+     * Returns the script code (one of the *_SCRIPT constants) of c.
+     */
+    public static byte getScript(char c) {
+        final byte block = getBlock(c);
+        return blockToScript[block];
+    }
+
+    /**
+     * Returns the script code that blockToScript assigns to a block code.
+     */
+    public static byte getScript(byte block) {
+        return blockToScript[block];
+    }
+
+    /**
+     * Returns the block code of c.
+     */
+    public static byte getBlock(char c) {
+        byte code = charToBlock[c >> 7];
+        // A negative entry -n points at row n-1 of the split table:
+        // { boundary, code below the boundary, code at or above it }.
+        // The selected code may itself be negative, so keep resolving.
+        while (code < 0) {
+            final int[] entry = split[-code - 1];
+            code = (byte) (c < entry[0] ? entry[1] : entry[2]);
+        }
+        return code;
+    }
+
+    /**
+     * Returns the first letter after c that belongs to the given script,
+     * or 0xFFFF if there is none.
+     */
+    public static char getNextLetter(char c, byte script) {
+        char candidate = c;
+        while (candidate != 0xFFFF) {
+            ++candidate;
+            if (Character.isLetter(candidate) && getScript(candidate) == script) {
+                return candidate;
+            }
+        }
+        return candidate;
+    }
+
+    /**
+     * Formats ch as four upper-case hexadecimal digits, zero-padded on the
+     * left.
+     */
+    public static String hex(char ch) {
+        final String digits = Integer.toString(ch, 16).toUpperCase();
+        final StringBuffer padded = new StringBuffer(4);
+        for (int width = digits.length(); width < 4; ++width) {
+            padded.append('0');
+        }
+        return padded.append(digits).toString();
+    }
+
+    /**
+     * Formats every char of s with hex(char), separated by commas.
+     */
+    public static String hex(String s) {
+        return hex(s, ",");
+    }
+
+    /**
+     * Formats every char of s with hex(char), placing sep between
+     * consecutive values. An empty string yields an empty string.
+     */
+    public static String hex(String s, String sep) {
+        final int length = s.length();
+        final StringBuffer joined = new StringBuffer();
+        for (int i = 0; i < length; ++i) {
+            if (i > 0) {
+                joined.append(sep);
+            }
+            joined.append(hex(s.charAt(i)));
+        }
+        return joined.toString();
+    }
+
+    /**
+     * Debugging aid: prints to System.out every code point in 0000..FFFE at
+     * which the block code changes, then every code point at which the
+     * script code changes.
+     */
+    public static void test() {
+        System.out.println("Blocks: ");
+        byte previous = Byte.MIN_VALUE;
+        for (int code = 0; code < 0xFFFF; ++code) {
+            final char ch = (char) code;
+            final byte block = getBlock(ch);
+            if (block == previous) continue;
+            System.out.println(hex(ch) + "\t" + block);
+            previous = block;
+        }
+        System.out.println();
+        System.out.println("Scripts: ");
+        previous = Byte.MIN_VALUE;
+        for (int code = 0; code < 0xFFFF; ++code) {
+            final char ch = (char) code;
+            final byte script = getScript(ch);
+            if (script == previous) continue;
+            System.out.println(hex(ch) + "\t" + script);
+            previous = script;
+        }
+    }
+
+    // Script codes: the values stored in blockToScript.
+    public static final byte
+        COMMON_SCRIPT = 0,
+        LATIN_SCRIPT = 1,
+        GREEK_SCRIPT = 2,
+        CYRILLIC_SCRIPT = 3,
+        ARMENIAN_SCRIPT = 4,
+        HEBREW_SCRIPT = 5,
+        ARABIC_SCRIPT = 6,
+        SYRIAC_SCRIPT = 7,
+        THAANA_SCRIPT = 8,
+        DEVANAGARI_SCRIPT = 9,
+        BENGALI_SCRIPT = 10,
+        GURMUKHI_SCRIPT = 11,
+        GUJARATI_SCRIPT = 12,
+        ORIYA_SCRIPT = 13,
+        TAMIL_SCRIPT = 14,
+        TELUGU_SCRIPT = 15,
+        KANNADA_SCRIPT = 16,
+        MALAYALAM_SCRIPT = 17,
+        SINHALA_SCRIPT = 18,
+        THAI_SCRIPT = 19,
+        LAO_SCRIPT = 20,
+        TIBETAN_SCRIPT = 21,
+        MYANMAR_SCRIPT = 22,
+        GEORGIAN_SCRIPT = 23,
+        JAMO_SCRIPT = 24,
+        HANGUL_SCRIPT = 25,
+        ETHIOPIC_SCRIPT = 26,
+        CHEROKEE_SCRIPT = 27,
+        ABORIGINAL_SCRIPT = 28,
+        OGHAM_SCRIPT = 29,
+        RUNIC_SCRIPT = 30,
+        KHMER_SCRIPT = 31,
+        MONGOLIAN_SCRIPT = 32,
+        HIRAGANA_SCRIPT = 33,
+        KATAKANA_SCRIPT = 34,
+        BOPOMOFO_SCRIPT = 35,
+        HAN_SCRIPT = 36,
+        YI_SCRIPT = 37;
+
+    // Block codes: the non-negative values stored in charToBlock and split,
+    // and the indices into blockToScript.
+    public static final byte
+        RESERVED_BLOCK = 0,
+        BASIC_LATIN = 1,
+        LATIN_1_SUPPLEMENT = 2,
+        LATIN_EXTENDED_A = 3,
+        LATIN_EXTENDED_B = 4,
+        IPA_EXTENSIONS = 5,
+        SPACING_MODIFIER_LETTERS = 6,
+        COMBINING_DIACRITICAL_MARKS = 7,
+        GREEK = 8,
+        CYRILLIC = 9,
+        ARMENIAN = 10,
+        HEBREW = 11,
+        ARABIC = 12,
+        SYRIAC = 13,
+        THAANA = 14,
+        DEVANAGARI = 15,
+        BENGALI = 16,
+        GURMUKHI = 17,
+        GUJARATI = 18,
+        ORIYA = 19,
+        TAMIL = 20,
+        TELUGU = 21,
+        KANNADA = 22,
+        MALAYALAM = 23,
+        SINHALA = 24,
+        THAI = 25,
+        LAO = 26,
+        TIBETAN = 27,
+        MYANMAR = 28,
+        GEORGIAN = 29,
+        HANGUL_JAMO = 30,
+        ETHIOPIC = 31,
+        CHEROKEE = 32,
+        UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
+        OGHAM = 34,
+        RUNIC = 35,
+        KHMER = 36,
+        MONGOLIAN = 37,
+        LATIN_EXTENDED_ADDITIONAL = 38,
+        GREEK_EXTENDED = 39,
+        GENERAL_PUNCTUATION = 40,
+        SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
+        CURRENCY_SYMBOLS = 42,
+        COMBINING_MARKS_FOR_SYMBOLS = 43,
+        LETTERLIKE_SYMBOLS = 44,
+        NUMBER_FORMS = 45,
+        ARROWS = 46,
+        MATHEMATICAL_OPERATORS = 47,
+        MISCELLANEOUS_TECHNICAL = 48,
+        CONTROL_PICTURES = 49,
+        OPTICAL_CHARACTER_RECOGNITION = 50,
+        ENCLOSED_ALPHANUMERICS = 51,
+        BOX_DRAWING = 52,
+        BLOCK_ELEMENTS = 53,
+        GEOMETRIC_SHAPES = 54,
+        MISCELLANEOUS_SYMBOLS = 55,
+        DINGBATS = 56,
+        BRAILLE_PATTERNS = 57,
+        CJK_RADICALS_SUPPLEMENT = 58,
+        KANGXI_RADICALS = 59,
+        IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
+        CJK_SYMBOLS_AND_PUNCTUATION = 61,
+        HIRAGANA = 62,
+        KATAKANA = 63,
+        BOPOMOFO = 64,
+        HANGUL_COMPATIBILITY_JAMO = 65,
+        KANBUN = 66,
+        BOPOMOFO_EXTENDED = 67,
+        ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
+        CJK_COMPATIBILITY = 69,
+        CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
+        CJK_UNIFIED_IDEOGRAPHS = 71,
+        YI_SYLLABLES = 72,
+        YI_RADICALS = 73,
+        HANGUL_SYLLABLES = 74,
+        HIGH_SURROGATES = 75,
+        HIGH_PRIVATE_USE_SURROGATES = 76,
+        LOW_SURROGATES = 77,
+        PRIVATE_USE = 78,
+        CJK_COMPATIBILITY_IDEOGRAPHS = 79,
+        ALPHABETIC_PRESENTATION_FORMS = 80,
+        ARABIC_PRESENTATION_FORMS_A = 81,
+        COMBINING_HALF_MARKS = 82,
+        CJK_COMPATIBILITY_FORMS = 83,
+        SMALL_FORM_VARIANTS = 84,
+        ARABIC_PRESENTATION_FORMS_B = 85,
+        SPECIALS = 86,
+        HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
+
+    // Script code for each block code; the index is the block code.
+    static final byte[] blockToScript = {
+        COMMON_SCRIPT,      //  0, RESERVED_BLOCK
+        LATIN_SCRIPT,       //  1, BASIC_LATIN
+        LATIN_SCRIPT,       //  2, LATIN_1_SUPPLEMENT
+        LATIN_SCRIPT,       //  3, LATIN_EXTENDED_A
+        LATIN_SCRIPT,       //  4, LATIN_EXTENDED_B
+        LATIN_SCRIPT,       //  5, IPA_EXTENSIONS
+        COMMON_SCRIPT,      //  6, SPACING_MODIFIER_LETTERS
+        COMMON_SCRIPT,      //  7, COMBINING_DIACRITICAL_MARKS
+        GREEK_SCRIPT,       //  8, GREEK
+        CYRILLIC_SCRIPT,    //  9, CYRILLIC
+        ARMENIAN_SCRIPT,    // 10, ARMENIAN
+        HEBREW_SCRIPT,      // 11, HEBREW
+        ARABIC_SCRIPT,      // 12, ARABIC
+        SYRIAC_SCRIPT,      // 13, SYRIAC
+        THAANA_SCRIPT,      // 14, THAANA
+        DEVANAGARI_SCRIPT,  // 15, DEVANAGARI
+        BENGALI_SCRIPT,     // 16, BENGALI
+        GURMUKHI_SCRIPT,    // 17, GURMUKHI
+        GUJARATI_SCRIPT,    // 18, GUJARATI
+        ORIYA_SCRIPT,       // 19, ORIYA
+        TAMIL_SCRIPT,       // 20, TAMIL
+        TELUGU_SCRIPT,      // 21, TELUGU
+        KANNADA_SCRIPT,     // 22, KANNADA
+        MALAYALAM_SCRIPT,   // 23, MALAYALAM
+        SINHALA_SCRIPT,     // 24, SINHALA
+        THAI_SCRIPT,        // 25, THAI
+        LAO_SCRIPT,         // 26, LAO
+        TIBETAN_SCRIPT,     // 27, TIBETAN
+        MYANMAR_SCRIPT,     // 28, MYANMAR
+        GEORGIAN_SCRIPT,    // 29, GEORGIAN
+        JAMO_SCRIPT,        // 30, HANGUL_JAMO
+        ETHIOPIC_SCRIPT,    // 31, ETHIOPIC
+        CHEROKEE_SCRIPT,    // 32, CHEROKEE
+        ABORIGINAL_SCRIPT,  // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
+        OGHAM_SCRIPT,       // 34, OGHAM
+        RUNIC_SCRIPT,       // 35, RUNIC
+        KHMER_SCRIPT,       // 36, KHMER
+        MONGOLIAN_SCRIPT,   // 37, MONGOLIAN
+        LATIN_SCRIPT,       // 38, LATIN_EXTENDED_ADDITIONAL
+        GREEK_SCRIPT,       // 39, GREEK_EXTENDED
+        COMMON_SCRIPT,      // 40, GENERAL_PUNCTUATION
+        COMMON_SCRIPT,      // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
+        COMMON_SCRIPT,      // 42, CURRENCY_SYMBOLS
+        COMMON_SCRIPT,      // 43, COMBINING_MARKS_FOR_SYMBOLS
+        COMMON_SCRIPT,      // 44, LETTERLIKE_SYMBOLS
+        COMMON_SCRIPT,      // 45, NUMBER_FORMS
+        COMMON_SCRIPT,      // 46, ARROWS
+        COMMON_SCRIPT,      // 47, MATHEMATICAL_OPERATORS
+        COMMON_SCRIPT,      // 48, MISCELLANEOUS_TECHNICAL
+        COMMON_SCRIPT,      // 49, CONTROL_PICTURES
+        COMMON_SCRIPT,      // 50, OPTICAL_CHARACTER_RECOGNITION
+        COMMON_SCRIPT,      // 51, ENCLOSED_ALPHANUMERICS
+        COMMON_SCRIPT,      // 52, BOX_DRAWING
+        COMMON_SCRIPT,      // 53, BLOCK_ELEMENTS
+        COMMON_SCRIPT,      // 54, GEOMETRIC_SHAPES
+        COMMON_SCRIPT,      // 55, MISCELLANEOUS_SYMBOLS
+        COMMON_SCRIPT,      // 56, DINGBATS
+        COMMON_SCRIPT,      // 57, BRAILLE_PATTERNS
+        HAN_SCRIPT,         // 58, CJK_RADICALS_SUPPLEMENT
+        HAN_SCRIPT,         // 59, KANGXI_RADICALS
+        HAN_SCRIPT,         // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
+        COMMON_SCRIPT,      // 61, CJK_SYMBOLS_AND_PUNCTUATION
+        HIRAGANA_SCRIPT,    // 62, HIRAGANA
+        KATAKANA_SCRIPT,    // 63, KATAKANA
+        BOPOMOFO_SCRIPT,    // 64, BOPOMOFO
+        JAMO_SCRIPT,        // 65, HANGUL_COMPATIBILITY_JAMO
+        HAN_SCRIPT,         // 66, KANBUN
+        BOPOMOFO_SCRIPT,    // 67, BOPOMOFO_EXTENDED
+        COMMON_SCRIPT,      // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
+        COMMON_SCRIPT,      // 69, CJK_COMPATIBILITY
+        HAN_SCRIPT,         // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+        HAN_SCRIPT,         // 71, CJK_UNIFIED_IDEOGRAPHS
+        YI_SCRIPT,          // 72, YI_SYLLABLES
+        YI_SCRIPT,          // 73, YI_RADICALS
+        HANGUL_SCRIPT,      // 74, HANGUL_SYLLABLES
+        COMMON_SCRIPT,      // 75, HIGH_SURROGATES
+        COMMON_SCRIPT,      // 76, HIGH_PRIVATE_USE_SURROGATES
+        COMMON_SCRIPT,      // 77, LOW_SURROGATES
+        COMMON_SCRIPT,      // 78, PRIVATE_USE
+        HAN_SCRIPT,         // 79, CJK_COMPATIBILITY_IDEOGRAPHS
+        COMMON_SCRIPT,      // 80, ALPHABETIC_PRESENTATION_FORMS
+        ARABIC_SCRIPT,      // 81, ARABIC_PRESENTATION_FORMS_A
+        COMMON_SCRIPT,      // 82, COMBINING_HALF_MARKS
+        COMMON_SCRIPT,      // 83, CJK_COMPATIBILITY_FORMS
+        COMMON_SCRIPT,      // 84, SMALL_FORM_VARIANTS
+        ARABIC_SCRIPT,      // 85, ARABIC_PRESENTATION_FORMS_B
+        COMMON_SCRIPT,      // 86, SPECIALS
+        COMMON_SCRIPT,      // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
+        COMMON_SCRIPT,      // 88, SPECIALS
+    };
+
+    // Exceptions for 128-character pages that hold more than one block.
+    // Row n-1 is selected by a code of -n and reads
+    // { boundary, code below the boundary, code at or above the boundary }.
+    // could be further reduced to a byte array, but I didn't bother.
+    static final int[][] split = {
+        {0x0250,  4,   5},  //  -1
+        {0x02B0,  5,   6},  //  -2
+        {0x0370,  7,   8},  //  -3
+        {0x0530,  0,  10},  //  -4
+        {0x0590, 10,  11},  //  -5
+        {0x0750, 13,   0},  //  -6
+        {0x07C0, 14,   0},  //  -7
+        {0x10A0, 28,  29},  //  -8
+        {0x13A0,  0,  32},  //  -9
+        {0x16A0, 34,  35},  // -10
+        {0x18B0, 37,   0},  // -11
+        {0x2070, 40,  41},  // -12
+        {0x20A0, 41, -31},  // -13
+        {0x2150, 44,  45},  // -14
+        {0x2190, 45,  46},  // -15
+        {0x2440, 49, -32},  // -16
+        {0x25A0, 53,  54},  // -17
+        {0x27C0, 56,   0},  // -18
+        {0x2FE0, 59, -33},  // -19
+        {0x3040, 61,  62},  // -20
+        {0x30A0, 62,  63},  // -21
+        {0x3130, 64,  65},  // -22
+        {0x3190, 65, -34},  // -23
+        {0x4DB6, 70,   0},  // -24
+        {0xA490, 72, -35},  // -25
+        {0xD7A4, 74,   0},  // -26
+        {0xFB50, 80,  81},  // -27
+        {0xFE20,  0, -36},  // -28
+        {0xFEFF, 85,  86},  // -29
+        {0xFFF0, 87, -37},  // -30
+        {0x20D0, 42,  43},  // -31
+        {0x2460, 50,  51},  // -32
+        {0x2FF0,  0,  60},  // -33
+        {0x31A0, 66, -38},  // -34
+        {0xA4D0, 73,   0},  // -35
+        {0xFE30, 82, -39},  // -36
+        {0xFFFE, 88,   0},  // -37
+        {0x31C0, 67,   0},  // -38
+        {0xFE50, 83, -40},  // -39
+        {0xFE70, 84,  85}   // -40
+    };
+
+    // Block code for each page of 128 characters; the index is c >> 7.
+    // Negative entries are resolved through the split table.
+    static final byte[] charToBlock = {
+         1,  2,  3,  4, -1, -2, -3,  8,  9,  9, -4, -5, 12, 12, -6, -7,
+         0,  0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
+        28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
+        37, -11, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 38, 38, 39, 39,
+        -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
+        57, 57,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 58, 59, -19,
+        -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
+        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
+        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
+        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+        72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0,  0,  0,  0,  0,  0,
+         0,  0,  0,  0,  0,  0,  0,  0, 74, 74, 74, 74, 74, 74, 74, 74,
+        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
+        75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
+        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
+        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
+        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
+        78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
+    };
+
+}