mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
Import Mark Davis' round trip test - currently fails!
X-SVN-Rev: 1260
This commit is contained in:
parent
18003aee55
commit
7ae37ad56d
4 changed files with 1498 additions and 0 deletions
370
icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
Executable file
370
icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
Executable file
|
@ -0,0 +1,370 @@
|
|||
package com.ibm.test.translit;
|
||||
import com.ibm.test.*;
|
||||
import com.ibm.text.*;
|
||||
import java.io.*;
|
||||
import java.text.ParseException;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary Round trip test of Transliterator
|
||||
*/
|
||||
public class RoundTripTest extends TestFmwk {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new RoundTripTest().run(args);
|
||||
}
|
||||
|
||||
public void TestRoundTrip() throws IOException, ParseException {
|
||||
Test t;
|
||||
|
||||
// Test Hiragana
|
||||
new Test("Latin-Kana",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HIRAGANA_SCRIPT)
|
||||
.test("[a-z]", "[\u3040-\u3094]", this);
|
||||
|
||||
// Test Katakana
|
||||
new Test("Latin-Kana",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.KATAKANA_SCRIPT)
|
||||
.test("[A-Z]", "[\u30A1-\u30FA]", this);
|
||||
|
||||
// Test Arabic
|
||||
new Test("Latin-Arabic",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.ARABIC_SCRIPT)
|
||||
.test(null, "[\u0620-\u065F-[\u0640]]", this);
|
||||
|
||||
// Test Hebrew
|
||||
new Test("Latin-Hebrew",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HEBREW_SCRIPT)
|
||||
.test(null, "[\u05D0-\u05EF]", this);
|
||||
|
||||
// Test Hangul
|
||||
t = new TestHangul();
|
||||
t.setPairLimit(30); // Don't run full test -- too long
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test Jamo
|
||||
t = new Test("Latin-Jamo",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.JAMO_SCRIPT);
|
||||
t.setErrorLimit(100);
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test JamoHangul
|
||||
t = new Test("Latin-Jamo;Jamo-Hangul",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HANGUL_SCRIPT);
|
||||
t.setErrorLimit(100);
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test Greek
|
||||
new Test("Latin-Greek",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.GREEK_SCRIPT)
|
||||
.test(null, "[\u0380-\u03CF]", this);
|
||||
|
||||
// Test Cyrillic
|
||||
new Test("Latin-Cyrillic",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.CYRILLIC_SCRIPT)
|
||||
.test(null, "[\u0401\u0410-\u0451]", this);
|
||||
|
||||
// Test Utility
|
||||
// TestUtility.test(); // dump blocks and scripts for debugging
|
||||
}
|
||||
|
||||
static class Test {
|
||||
|
||||
PrintWriter out;
|
||||
|
||||
private String transliteratorID;
|
||||
private byte sourceScript;
|
||||
private byte targetScript;
|
||||
private boolean showProgress = true;
|
||||
private boolean showSuccess = false;
|
||||
private int errorLimit = Integer.MAX_VALUE;
|
||||
private int errorCount = 0;
|
||||
private int pairLimit = 0x10000;
|
||||
UnicodeSet sourceRange;
|
||||
UnicodeSet targetRange;
|
||||
TestLog log;
|
||||
|
||||
/*
|
||||
* create a test for the given script transliterator.
|
||||
*/
|
||||
Test(String transliteratorID,
|
||||
byte sourceScript, byte targetScript) {
|
||||
this.transliteratorID = transliteratorID;
|
||||
this.sourceScript = sourceScript;
|
||||
this.targetScript = targetScript;
|
||||
}
|
||||
|
||||
public void setErrorLimit(int limit) {
|
||||
errorLimit = limit;
|
||||
}
|
||||
|
||||
public void setPairLimit(int limit) {
|
||||
pairLimit = limit;
|
||||
}
|
||||
|
||||
public void test(String sourceRange, String targetRange, TestLog log)
|
||||
throws java.io.IOException, java.text.ParseException {
|
||||
|
||||
if (sourceRange != null && sourceRange.length() > 0) {
|
||||
this.sourceRange = new UnicodeSet(sourceRange);
|
||||
}
|
||||
if (targetRange != null && targetRange.length() > 0) {
|
||||
this.targetRange = new UnicodeSet(targetRange);
|
||||
}
|
||||
|
||||
if (this.sourceRange == null) this.sourceRange = new UnicodeSet("[a-Z]");
|
||||
|
||||
this.log = log;
|
||||
|
||||
// make a UTF-8 output file we can read with a browser
|
||||
|
||||
// note: check that every transliterator transliterates the null string correctly!
|
||||
|
||||
String logFileName = "test_" + transliteratorID + "_"
|
||||
+ sourceScript + "_" + targetScript + ".html";
|
||||
|
||||
log.logln("Creating log file " + logFileName);
|
||||
|
||||
out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(logFileName),
|
||||
"UTF8"),
|
||||
4*1024));
|
||||
//out.write('\uFFEF'); // BOM
|
||||
out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
|
||||
out.println("<HTML><HEAD>");
|
||||
out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
|
||||
out.println("<BODY>");
|
||||
out.println("<TABLE>");
|
||||
test2();
|
||||
out.println("</TABLE></BODY></HTML>");
|
||||
out.close();
|
||||
|
||||
if (errorCount > 0) {
|
||||
log.errln(transliteratorID + " errors: " + errorCount);
|
||||
} else {
|
||||
log.logln(transliteratorID + " ok");
|
||||
new File(logFileName).delete();
|
||||
}
|
||||
}
|
||||
|
||||
public void test2() {
|
||||
int count = 0;
|
||||
|
||||
Transliterator sourceToTarget = Transliterator.getInstance(transliteratorID);
|
||||
Transliterator targetToSource = sourceToTarget.getInverse();
|
||||
|
||||
log.logln("Checking that all source characters convert to target - Singles");
|
||||
|
||||
// check single letters
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(c)) continue;
|
||||
//if (showProgress && (count++ % 100) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(c));
|
||||
//}
|
||||
String cs = String.valueOf(c);
|
||||
String targ = sourceToTarget.transliterate(String.valueOf(cs));
|
||||
if (!isReceivingTarget(targ)) {
|
||||
out.println("<TR><TD>Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TD></TR>");
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(c + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that all source characters convert to target - Doubles");
|
||||
count = 0;
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(c)) continue;
|
||||
for (char d = 0; d < 0xFFFF; ++d) {
|
||||
if (Character.getType(d) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(d)) continue;
|
||||
String cs = String.valueOf(c) + d;
|
||||
//if (showProgress && (count++ % 1000) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(cs));
|
||||
//}
|
||||
String targ = sourceToTarget.transliterate(cs);
|
||||
if (!isReceivingTarget(targ)) {
|
||||
out.println("<TR><TD>Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>");
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(c + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that target characters convert to source and back - Singles");
|
||||
count = 0;
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(c)) continue;
|
||||
//if (showProgress && (count++ % 100) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(c));
|
||||
//}
|
||||
String cs = String.valueOf(c);
|
||||
if (c > 0x0400) {
|
||||
cs = cs + "";
|
||||
}
|
||||
String targ = targetToSource.transliterate(cs);
|
||||
String reverse = sourceToTarget.transliterate(targ);
|
||||
if (!isReceivingSource(targ)) {
|
||||
out.println("<TR><TD>Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (!cs.equals(reverse)) {
|
||||
out.println("<TR><TD>Fail Roundtrip:</TD><TD><FONT SIZE=\"6\">" +
|
||||
cs + "</FONT></TD><TD>(" +
|
||||
TestUtility.hex(cs) + ") =></TD><TD>" +
|
||||
targ + "</TD><TD>(" +
|
||||
TestUtility.hex(targ) + ") =></TD><TD><FONT SIZE=\"6\">" +
|
||||
reverse + "</TD><TD>(" +
|
||||
TestUtility.hex(reverse) + ")</TD></TR>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")"
|
||||
+ " => " + reverse + "(" + TestUtility.hex(reverse) + ")" );
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that target characters convert to source and back - Doubles");
|
||||
count = 0;
|
||||
|
||||
StringBuffer buf = new StringBuffer("aa");
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(c)) continue;
|
||||
if (++count > pairLimit) {
|
||||
out.println("<TR><TD>Test truncated at " + pairLimit + " x 64k pairs</TR></TD>");
|
||||
break;
|
||||
}
|
||||
buf.setCharAt(0, c);
|
||||
if (showProgress) { // && (count++ % 10000) == 0) {
|
||||
log.log(TestUtility.hex(c));
|
||||
// count + ": " + TestUtility.hex(cs));
|
||||
}
|
||||
for (char d = 0; d < 0xFFFF; ++d) {
|
||||
if (Character.getType(d) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(d)) continue;
|
||||
buf.setCharAt(1, d);
|
||||
String cs = buf.toString();
|
||||
String targ = targetToSource.transliterate(cs);
|
||||
String reverse = sourceToTarget.transliterate(targ);
|
||||
if (!isReceivingSource(targ)) {
|
||||
out.println("<TR><TD>Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (!cs.equals(reverse)) {
|
||||
out.println("<TR><TD>Fail Roundtrip:</TD><TD><FONT SIZE=\"6\">" +
|
||||
cs + "</FONT></TD><TD>(" +
|
||||
TestUtility.hex(cs) + ") =></TD><TD>" +
|
||||
targ + "</TD><TD>(" +
|
||||
TestUtility.hex(targ) + ") =></TD><TD><FONT SIZE=\"6\">" +
|
||||
reverse + "</TD><TD>(" +
|
||||
TestUtility.hex(reverse) + ")</TD></TR>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")"
|
||||
+ " => " + reverse + "(" + TestUtility.hex(reverse) + ")" );
|
||||
}
|
||||
}
|
||||
}
|
||||
if (showProgress) log.logln("");
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to filter for source-target mapping completeness
|
||||
* Typically is base alphabet, minus extended characters
|
||||
* Default is ASCII letters for Latin
|
||||
*/
|
||||
public boolean isSource(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
if (script != sourceScript) return false;
|
||||
if (!Character.isLetter(c)) return false;
|
||||
if (!sourceRange.contains(c)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to check for target back to source mapping.
|
||||
* Typically the same as the target script, plus punctuation
|
||||
*/
|
||||
public boolean isReceivingSource(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
return (script == sourceScript || script == TestUtility.COMMON_SCRIPT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to filter for target-source mapping
|
||||
* Typically is base alphabet, minus extended characters
|
||||
*/
|
||||
public boolean isTarget(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
if (script != targetScript) return false;
|
||||
if (!Character.isLetter(c)) return false;
|
||||
if (targetRange != null && !targetRange.contains(c)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to check for target-source mapping
|
||||
* Typically the same as the source script, plus punctuation
|
||||
*/
|
||||
public boolean isReceivingTarget(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
return (script == targetScript || script == TestUtility.COMMON_SCRIPT);
|
||||
}
|
||||
|
||||
|
||||
final boolean isSource(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isSource(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isTarget(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isTarget(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isReceivingSource(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isReceivingSource(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isReceivingTarget(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isReceivingTarget(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static class TestHangul extends Test {
|
||||
TestHangul () {
|
||||
super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);
|
||||
}
|
||||
|
||||
public boolean isSource(char c) {
|
||||
if (0x1113 <= c && c <= 0x1160) return false;
|
||||
if (0x1176 <= c && c <= 0x11F9) return false;
|
||||
if (0x3131 <= c && c <= 0x318E) return false;
|
||||
return super.isSource(c);
|
||||
}
|
||||
}
|
||||
}
|
379
icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java
Executable file
379
icu4j/src/com/ibm/icu/dev/test/translit/TestUtility.java
Executable file
|
@ -0,0 +1,379 @@
|
|||
package com.ibm.test.translit;
|
||||
/**
 * Lookup helpers used by the transliterator tests: maps a UTF-16 code unit
 * to a block code and a script code using the static tables in this class,
 * and formats characters as four-digit hex.
 */
public final class TestUtility {

    /** Returns the script code of the block that contains c. */
    public static byte getScript(char c) {
        return getScript(getBlock(c));
    }

    /**
     * Returns the script code assigned to a block code.
     *
     * @param block a non-negative block code (index into blockToScript)
     */
    public static byte getScript(byte block) {
        return blockToScript[block];
    }

    /**
     * Returns the block code of c. charToBlock has one entry per 128 code
     * units; a negative entry -k refers to split[k-1], whose tuple
     * {boundary, below, atOrAbove} selects the next value depending on
     * whether c is below the boundary. Tuple values may themselves be
     * negative, so the lookup repeats until a block code is reached.
     */
    public static byte getBlock(char c) {
        int index = c >> 7;
        byte block = charToBlock[index];
        while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
            int[] tuple = split[-block-1];
            if (c < tuple[0]) block = (byte)tuple[1];
            else block = (byte)tuple[2];
        }
        return block;
    }

    /**
     * Returns the next letter after c that belongs to the given script,
     * or 0xFFFF if there is none.
     */
    public static char getNextLetter(char c, byte script) {
        while (c < 0xFFFF) {
            ++c;
            if (getScript(c) == script && Character.isLetter(c)) {
                return c;
            }
        }
        return c;
    }

    /** Formats ch as four upper-case hex digits, e.g. 'a' gives "0061". */
    public static String hex(char ch) {
        String foo = Integer.toString(ch,16).toUpperCase();
        return "0000".substring(0,4-foo.length()) + foo;
    }

    /** Formats each code unit of s as four hex digits, separated by commas. */
    public static String hex(String s) {
        return hex(s,",");
    }

    /**
     * Formats each code unit of s as four hex digits, joined by sep.
     * Returns "" for an empty string.
     */
    public static String hex(String s, String sep) {
        if (s.length() == 0) return "";
        StringBuffer result = new StringBuffer(hex(s.charAt(0)));
        for (int i = 1; i < s.length(); ++i) {
            result.append(sep);
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Debugging aid: prints to System.out each code unit at which the block
     * code changes, then each code unit at which the script code changes.
     */
    public static void test() {
        System.out.println("Blocks: ");
        byte lastblock = -128;
        for (char cc = 0; cc < 0xFFFF; ++cc) {
            byte block = TestUtility.getBlock(cc);
            if (block != lastblock) {
                System.out.println(TestUtility.hex(cc) + "\t" + block);
                lastblock = block;
            }
        }
        System.out.println();
        System.out.println("Scripts: ");
        byte lastScript = -128;
        for (char cc = 0; cc < 0xFFFF; ++cc) {
            byte script = TestUtility.getScript(cc);
            if (script != lastScript) {
                System.out.println(TestUtility.hex(cc) + "\t" + script);
                lastScript = script;
            }
        }
    }

    public static final byte // SCRIPT CODE
        COMMON_SCRIPT = 0,
        LATIN_SCRIPT = 1,
        GREEK_SCRIPT = 2,
        CYRILLIC_SCRIPT = 3,
        ARMENIAN_SCRIPT = 4,
        HEBREW_SCRIPT = 5,
        ARABIC_SCRIPT = 6,
        SYRIAC_SCRIPT = 7,
        THAANA_SCRIPT = 8,
        DEVANAGARI_SCRIPT = 9,
        BENGALI_SCRIPT = 10,
        GURMUKHI_SCRIPT = 11,
        GUJARATI_SCRIPT = 12,
        ORIYA_SCRIPT = 13,
        TAMIL_SCRIPT = 14,
        TELUGU_SCRIPT = 15,
        KANNADA_SCRIPT = 16,
        MALAYALAM_SCRIPT = 17,
        SINHALA_SCRIPT = 18,
        THAI_SCRIPT = 19,
        LAO_SCRIPT = 20,
        TIBETAN_SCRIPT = 21,
        MYANMAR_SCRIPT = 22,
        GEORGIAN_SCRIPT = 23,
        JAMO_SCRIPT = 24,
        HANGUL_SCRIPT = 25,
        ETHIOPIC_SCRIPT = 26,
        CHEROKEE_SCRIPT = 27,
        ABORIGINAL_SCRIPT = 28,
        OGHAM_SCRIPT = 29,
        RUNIC_SCRIPT = 30,
        KHMER_SCRIPT = 31,
        MONGOLIAN_SCRIPT = 32,
        HIRAGANA_SCRIPT = 33,
        KATAKANA_SCRIPT = 34,
        BOPOMOFO_SCRIPT = 35,
        HAN_SCRIPT = 36,
        YI_SCRIPT = 37;

    public static final byte // block code
        RESERVED_BLOCK = 0,
        BASIC_LATIN = 1,
        LATIN_1_SUPPLEMENT = 2,
        LATIN_EXTENDED_A = 3,
        LATIN_EXTENDED_B = 4,
        IPA_EXTENSIONS = 5,
        SPACING_MODIFIER_LETTERS = 6,
        COMBINING_DIACRITICAL_MARKS = 7,
        GREEK = 8,
        CYRILLIC = 9,
        ARMENIAN = 10,
        HEBREW = 11,
        ARABIC = 12,
        SYRIAC = 13,
        THAANA = 14,
        DEVANAGARI = 15,
        BENGALI = 16,
        GURMUKHI = 17,
        GUJARATI = 18,
        ORIYA = 19,
        TAMIL = 20,
        TELUGU = 21,
        KANNADA = 22,
        MALAYALAM = 23,
        SINHALA = 24,
        THAI = 25,
        LAO = 26,
        TIBETAN = 27,
        MYANMAR = 28,
        GEORGIAN = 29,
        HANGUL_JAMO = 30,
        ETHIOPIC = 31,
        CHEROKEE = 32,
        UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
        OGHAM = 34,
        RUNIC = 35,
        KHMER = 36,
        MONGOLIAN = 37,
        LATIN_EXTENDED_ADDITIONAL = 38,
        GREEK_EXTENDED = 39,
        GENERAL_PUNCTUATION = 40,
        SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
        CURRENCY_SYMBOLS = 42,
        COMBINING_MARKS_FOR_SYMBOLS = 43,
        LETTERLIKE_SYMBOLS = 44,
        NUMBER_FORMS = 45,
        ARROWS = 46,
        MATHEMATICAL_OPERATORS = 47,
        MISCELLANEOUS_TECHNICAL = 48,
        CONTROL_PICTURES = 49,
        OPTICAL_CHARACTER_RECOGNITION = 50,
        ENCLOSED_ALPHANUMERICS = 51,
        BOX_DRAWING = 52,
        BLOCK_ELEMENTS = 53,
        GEOMETRIC_SHAPES = 54,
        MISCELLANEOUS_SYMBOLS = 55,
        DINGBATS = 56,
        BRAILLE_PATTERNS = 57,
        CJK_RADICALS_SUPPLEMENT = 58,
        KANGXI_RADICALS = 59,
        IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
        CJK_SYMBOLS_AND_PUNCTUATION = 61,
        HIRAGANA = 62,
        KATAKANA = 63,
        BOPOMOFO = 64,
        HANGUL_COMPATIBILITY_JAMO = 65,
        KANBUN = 66,
        BOPOMOFO_EXTENDED = 67,
        ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
        CJK_COMPATIBILITY = 69,
        CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
        CJK_UNIFIED_IDEOGRAPHS = 71,
        YI_SYLLABLES = 72,
        YI_RADICALS = 73,
        HANGUL_SYLLABLES = 74,
        HIGH_SURROGATES = 75,
        HIGH_PRIVATE_USE_SURROGATES = 76,
        LOW_SURROGATES = 77,
        PRIVATE_USE = 78,
        CJK_COMPATIBILITY_IDEOGRAPHS = 79,
        ALPHABETIC_PRESENTATION_FORMS = 80,
        ARABIC_PRESENTATION_FORMS_A = 81,
        COMBINING_HALF_MARKS = 82,
        CJK_COMPATIBILITY_FORMS = 83,
        SMALL_FORM_VARIANTS = 84,
        ARABIC_PRESENTATION_FORMS_B = 85,
        SPECIALS = 86,
        HALFWIDTH_AND_FULLWIDTH_FORMS = 87;

    /** Script code for each block code; the index is the block code. */
    static final byte[] blockToScript = {
        COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
        LATIN_SCRIPT, // 1, BASIC_LATIN
        LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
        LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
        LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
        LATIN_SCRIPT, // 5, IPA_EXTENSIONS
        COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
        COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
        GREEK_SCRIPT, // 8, GREEK
        CYRILLIC_SCRIPT, // 9, CYRILLIC
        ARMENIAN_SCRIPT, // 10, ARMENIAN
        HEBREW_SCRIPT, // 11, HEBREW
        ARABIC_SCRIPT, // 12, ARABIC
        SYRIAC_SCRIPT, // 13, SYRIAC
        THAANA_SCRIPT, // 14, THAANA
        DEVANAGARI_SCRIPT, // 15, DEVANAGARI
        BENGALI_SCRIPT, // 16, BENGALI
        GURMUKHI_SCRIPT, // 17, GURMUKHI
        GUJARATI_SCRIPT, // 18, GUJARATI
        ORIYA_SCRIPT, // 19, ORIYA
        TAMIL_SCRIPT, // 20, TAMIL
        TELUGU_SCRIPT, // 21, TELUGU
        KANNADA_SCRIPT, // 22, KANNADA
        MALAYALAM_SCRIPT, // 23, MALAYALAM
        SINHALA_SCRIPT, // 24, SINHALA
        THAI_SCRIPT, // 25, THAI
        LAO_SCRIPT, // 26, LAO
        TIBETAN_SCRIPT, // 27, TIBETAN
        MYANMAR_SCRIPT, // 28, MYANMAR
        GEORGIAN_SCRIPT, // 29, GEORGIAN
        JAMO_SCRIPT, // 30, HANGUL_JAMO
        ETHIOPIC_SCRIPT, // 31, ETHIOPIC
        CHEROKEE_SCRIPT, // 32, CHEROKEE
        ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
        OGHAM_SCRIPT, // 34, OGHAM
        RUNIC_SCRIPT, // 35, RUNIC
        KHMER_SCRIPT, // 36, KHMER
        MONGOLIAN_SCRIPT, // 37, MONGOLIAN
        LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
        GREEK_SCRIPT, // 39, GREEK_EXTENDED
        COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
        COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
        COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
        COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
        COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
        COMMON_SCRIPT, // 45, NUMBER_FORMS
        COMMON_SCRIPT, // 46, ARROWS
        COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
        COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
        COMMON_SCRIPT, // 49, CONTROL_PICTURES
        COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
        COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
        COMMON_SCRIPT, // 52, BOX_DRAWING
        COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
        COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
        COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
        COMMON_SCRIPT, // 56, DINGBATS
        COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
        HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
        HAN_SCRIPT, // 59, KANGXI_RADICALS
        HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
        COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
        HIRAGANA_SCRIPT, // 62, HIRAGANA
        KATAKANA_SCRIPT, // 63, KATAKANA
        BOPOMOFO_SCRIPT, // 64, BOPOMOFO
        JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
        HAN_SCRIPT, // 66, KANBUN
        BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
        COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
        COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
        HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
        HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
        YI_SCRIPT, // 72, YI_SYLLABLES
        YI_SCRIPT, // 73, YI_RADICALS
        HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
        COMMON_SCRIPT, // 75, HIGH_SURROGATES
        COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
        COMMON_SCRIPT, // 77, LOW_SURROGATES
        COMMON_SCRIPT, // 78, PRIVATE_USE
        HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
        COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
        ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
        COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
        COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
        COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
        ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
        COMMON_SCRIPT, // 86, SPECIALS
        COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
        COMMON_SCRIPT, // 88, SPECIALS (code 88 has no named constant; reached via split entry -37)
    };

    // Each tuple is {boundary, value below boundary, value at or above it};
    // a negative value chains to another tuple (see getBlock).
    // could be further reduced to a byte array, but I didn't bother.
    static final int[][] split = {
        {0x0250, 4, 5}, // -1
        {0x02B0, 5, 6}, // -2
        {0x0370, 7, 8}, // -3
        {0x0530, 0, 10}, // -4
        {0x0590, 10, 11}, // -5
        {0x0750, 13, 0}, // -6
        {0x07C0, 14, 0}, // -7
        {0x10A0, 28, 29}, // -8
        {0x13A0, 0, 32}, // -9
        {0x16A0, 34, 35}, // -10
        {0x18B0, 37, 0}, // -11
        {0x2070, 40, 41}, // -12
        {0x20A0, 41, -31}, // -13
        {0x2150, 44, 45}, // -14
        {0x2190, 45, 46}, // -15
        {0x2440, 49, -32}, // -16
        {0x25A0, 53, 54}, // -17
        {0x27C0, 56, 0}, // -18
        {0x2FE0, 59, -33}, // -19
        {0x3040, 61, 62}, // -20
        {0x30A0, 62, 63}, // -21
        {0x3130, 64, 65}, // -22
        {0x3190, 65, -34}, // -23
        {0x4DB6, 70, 0}, // -24
        {0xA490, 72, -35}, // -25
        {0xD7A4, 74, 0}, // -26
        {0xFB50, 80, 81}, // -27
        {0xFE20, 0, -36}, // -28
        {0xFEFF, 85, 86}, // -29
        {0xFFF0, 87, -37}, // -30
        {0x20D0, 42, 43}, // -31
        {0x2460, 50, 51}, // -32
        {0x2FF0, 0, 60}, // -33
        {0x31A0, 66, -38}, // -34
        {0xA4D0, 73, 0}, //-35
        {0xFE30, 82, -39}, //-36
        {0xFFFE, 88, 0}, //-37
        {0x31C0, 67, 0}, // -38
        {0xFE50, 83, -40}, //-39
        {0xFE70, 84, 85} // -40
    };

    // One entry per 128 code units (index = c >> 7); negative entries refer
    // to the split table.
    static final byte[] charToBlock = {
        1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
        0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
        28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
        37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
        -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
        57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
        -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
        70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
        72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
        74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
        75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
    };

}
|
370
icu4j/src/com/ibm/test/translit/RoundTripTest.java
Executable file
370
icu4j/src/com/ibm/test/translit/RoundTripTest.java
Executable file
|
@ -0,0 +1,370 @@
|
|||
package com.ibm.test.translit;
|
||||
import com.ibm.test.*;
|
||||
import com.ibm.text.*;
|
||||
import java.io.*;
|
||||
import java.text.ParseException;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary Round trip test of Transliterator
|
||||
*/
|
||||
public class RoundTripTest extends TestFmwk {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new RoundTripTest().run(args);
|
||||
}
|
||||
|
||||
public void TestRoundTrip() throws IOException, ParseException {
|
||||
Test t;
|
||||
|
||||
// Test Hiragana
|
||||
new Test("Latin-Kana",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HIRAGANA_SCRIPT)
|
||||
.test("[a-z]", "[\u3040-\u3094]", this);
|
||||
|
||||
// Test Katakana
|
||||
new Test("Latin-Kana",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.KATAKANA_SCRIPT)
|
||||
.test("[A-Z]", "[\u30A1-\u30FA]", this);
|
||||
|
||||
// Test Arabic
|
||||
new Test("Latin-Arabic",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.ARABIC_SCRIPT)
|
||||
.test(null, "[\u0620-\u065F-[\u0640]]", this);
|
||||
|
||||
// Test Hebrew
|
||||
new Test("Latin-Hebrew",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HEBREW_SCRIPT)
|
||||
.test(null, "[\u05D0-\u05EF]", this);
|
||||
|
||||
// Test Hangul
|
||||
t = new TestHangul();
|
||||
t.setPairLimit(30); // Don't run full test -- too long
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test Jamo
|
||||
t = new Test("Latin-Jamo",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.JAMO_SCRIPT);
|
||||
t.setErrorLimit(100);
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test JamoHangul
|
||||
t = new Test("Latin-Jamo;Jamo-Hangul",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.HANGUL_SCRIPT);
|
||||
t.setErrorLimit(100);
|
||||
t.test(null, null, this);
|
||||
|
||||
// Test Greek
|
||||
new Test("Latin-Greek",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.GREEK_SCRIPT)
|
||||
.test(null, "[\u0380-\u03CF]", this);
|
||||
|
||||
// Test Cyrillic
|
||||
new Test("Latin-Cyrillic",
|
||||
TestUtility.LATIN_SCRIPT, TestUtility.CYRILLIC_SCRIPT)
|
||||
.test(null, "[\u0401\u0410-\u0451]", this);
|
||||
|
||||
// Test Utility
|
||||
// TestUtility.test(); // dump blocks and scripts for debugging
|
||||
}
|
||||
|
||||
static class Test {
|
||||
|
||||
PrintWriter out;
|
||||
|
||||
private String transliteratorID;
|
||||
private byte sourceScript;
|
||||
private byte targetScript;
|
||||
private boolean showProgress = true;
|
||||
private boolean showSuccess = false;
|
||||
private int errorLimit = Integer.MAX_VALUE;
|
||||
private int errorCount = 0;
|
||||
private int pairLimit = 0x10000;
|
||||
UnicodeSet sourceRange;
|
||||
UnicodeSet targetRange;
|
||||
TestLog log;
|
||||
|
||||
/*
 * Creates a test for the given script transliterator.
 *
 * transliteratorID is the ID later passed to Transliterator.getInstance;
 * sourceScript and targetScript are TestUtility script codes.
 */
Test(String transliteratorID,
     byte sourceScript, byte targetScript) {
    this.transliteratorID = transliteratorID;
    this.sourceScript = sourceScript;
    this.targetScript = targetScript;
}
|
||||
|
||||
public void setErrorLimit(int limit) {
|
||||
errorLimit = limit;
|
||||
}
|
||||
|
||||
public void setPairLimit(int limit) {
|
||||
pairLimit = limit;
|
||||
}
|
||||
|
||||
public void test(String sourceRange, String targetRange, TestLog log)
|
||||
throws java.io.IOException, java.text.ParseException {
|
||||
|
||||
if (sourceRange != null && sourceRange.length() > 0) {
|
||||
this.sourceRange = new UnicodeSet(sourceRange);
|
||||
}
|
||||
if (targetRange != null && targetRange.length() > 0) {
|
||||
this.targetRange = new UnicodeSet(targetRange);
|
||||
}
|
||||
|
||||
if (this.sourceRange == null) this.sourceRange = new UnicodeSet("[a-Z]");
|
||||
|
||||
this.log = log;
|
||||
|
||||
// make a UTF-8 output file we can read with a browser
|
||||
|
||||
// note: check that every transliterator transliterates the null string correctly!
|
||||
|
||||
String logFileName = "test_" + transliteratorID + "_"
|
||||
+ sourceScript + "_" + targetScript + ".html";
|
||||
|
||||
log.logln("Creating log file " + logFileName);
|
||||
|
||||
out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream(logFileName),
|
||||
"UTF8"),
|
||||
4*1024));
|
||||
//out.write('\uFFEF'); // BOM
|
||||
out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
|
||||
out.println("<HTML><HEAD>");
|
||||
out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
|
||||
out.println("<BODY>");
|
||||
out.println("<TABLE>");
|
||||
test2();
|
||||
out.println("</TABLE></BODY></HTML>");
|
||||
out.close();
|
||||
|
||||
if (errorCount > 0) {
|
||||
log.errln(transliteratorID + " errors: " + errorCount);
|
||||
} else {
|
||||
log.logln(transliteratorID + " ok");
|
||||
new File(logFileName).delete();
|
||||
}
|
||||
}
|
||||
|
||||
public void test2() {
|
||||
int count = 0;
|
||||
|
||||
Transliterator sourceToTarget = Transliterator.getInstance(transliteratorID);
|
||||
Transliterator targetToSource = sourceToTarget.getInverse();
|
||||
|
||||
log.logln("Checking that all source characters convert to target - Singles");
|
||||
|
||||
// check single letters
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(c)) continue;
|
||||
//if (showProgress && (count++ % 100) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(c));
|
||||
//}
|
||||
String cs = String.valueOf(c);
|
||||
String targ = sourceToTarget.transliterate(String.valueOf(cs));
|
||||
if (!isReceivingTarget(targ)) {
|
||||
out.println("<TR><TD>Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TD></TR>");
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(c + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that all source characters convert to target - Doubles");
|
||||
count = 0;
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(c)) continue;
|
||||
for (char d = 0; d < 0xFFFF; ++d) {
|
||||
if (Character.getType(d) == Character.UNASSIGNED) continue;
|
||||
if (!isSource(d)) continue;
|
||||
String cs = String.valueOf(c) + d;
|
||||
//if (showProgress && (count++ % 1000) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(cs));
|
||||
//}
|
||||
String targ = sourceToTarget.transliterate(cs);
|
||||
if (!isReceivingTarget(targ)) {
|
||||
out.println("<TR><TD>Fail Source-Target: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>");
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(c + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that target characters convert to source and back - Singles");
|
||||
count = 0;
|
||||
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(c)) continue;
|
||||
//if (showProgress && (count++ % 100) == 0) {
|
||||
// log.logln(count + ": " + TestUtility.hex(c));
|
||||
//}
|
||||
String cs = String.valueOf(c);
|
||||
if (c > 0x0400) {
|
||||
cs = cs + "";
|
||||
}
|
||||
String targ = targetToSource.transliterate(cs);
|
||||
String reverse = sourceToTarget.transliterate(targ);
|
||||
if (!isReceivingSource(targ)) {
|
||||
out.println("<TR><TD>Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (!cs.equals(reverse)) {
|
||||
out.println("<TR><TD>Fail Roundtrip:</TD><TD><FONT SIZE=\"6\">" +
|
||||
cs + "</FONT></TD><TD>(" +
|
||||
TestUtility.hex(cs) + ") =></TD><TD>" +
|
||||
targ + "</TD><TD>(" +
|
||||
TestUtility.hex(targ) + ") =></TD><TD><FONT SIZE=\"6\">" +
|
||||
reverse + "</TD><TD>(" +
|
||||
TestUtility.hex(reverse) + ")</TD></TR>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")"
|
||||
+ " => " + reverse + "(" + TestUtility.hex(reverse) + ")" );
|
||||
}
|
||||
}
|
||||
|
||||
log.logln("Checking that target characters convert to source and back - Doubles");
|
||||
count = 0;
|
||||
|
||||
StringBuffer buf = new StringBuffer("aa");
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if (Character.getType(c) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(c)) continue;
|
||||
if (++count > pairLimit) {
|
||||
out.println("<TR><TD>Test truncated at " + pairLimit + " x 64k pairs</TR></TD>");
|
||||
break;
|
||||
}
|
||||
buf.setCharAt(0, c);
|
||||
if (showProgress) { // && (count++ % 10000) == 0) {
|
||||
log.log(TestUtility.hex(c));
|
||||
// count + ": " + TestUtility.hex(cs));
|
||||
}
|
||||
for (char d = 0; d < 0xFFFF; ++d) {
|
||||
if (Character.getType(d) == Character.UNASSIGNED) continue;
|
||||
if (!isTarget(d)) continue;
|
||||
buf.setCharAt(1, d);
|
||||
String cs = buf.toString();
|
||||
String targ = targetToSource.transliterate(cs);
|
||||
String reverse = sourceToTarget.transliterate(targ);
|
||||
if (!isReceivingSource(targ)) {
|
||||
out.println("<TR><TD>Fail Target-Source: " + cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")</TR></TD>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (!cs.equals(reverse)) {
|
||||
out.println("<TR><TD>Fail Roundtrip:</TD><TD><FONT SIZE=\"6\">" +
|
||||
cs + "</FONT></TD><TD>(" +
|
||||
TestUtility.hex(cs) + ") =></TD><TD>" +
|
||||
targ + "</TD><TD>(" +
|
||||
TestUtility.hex(targ) + ") =></TD><TD><FONT SIZE=\"6\">" +
|
||||
reverse + "</TD><TD>(" +
|
||||
TestUtility.hex(reverse) + ")</TD></TR>" );
|
||||
if (++errorCount > errorLimit) return;
|
||||
} else if (showSuccess) {
|
||||
out.println(cs + "(" + TestUtility.hex(cs) + ")"
|
||||
+ " => " + targ + "(" + TestUtility.hex(targ) + ")"
|
||||
+ " => " + reverse + "(" + TestUtility.hex(reverse) + ")" );
|
||||
}
|
||||
}
|
||||
}
|
||||
if (showProgress) log.logln("");
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to filter for source-target mapping completeness
|
||||
* Typically is base alphabet, minus extended characters
|
||||
* Default is ASCII letters for Latin
|
||||
*/
|
||||
public boolean isSource(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
if (script != sourceScript) return false;
|
||||
if (!Character.isLetter(c)) return false;
|
||||
if (!sourceRange.contains(c)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to check for target back to source mapping.
|
||||
* Typically the same as the target script, plus punctuation
|
||||
*/
|
||||
public boolean isReceivingSource(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
return (script == sourceScript || script == TestUtility.COMMON_SCRIPT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to filter for target-source mapping
|
||||
* Typically is base alphabet, minus extended characters
|
||||
*/
|
||||
public boolean isTarget(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
if (script != targetScript) return false;
|
||||
if (!Character.isLetter(c)) return false;
|
||||
if (targetRange != null && !targetRange.contains(c)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Characters to check for target-source mapping
|
||||
* Typically the same as the source script, plus punctuation
|
||||
*/
|
||||
public boolean isReceivingTarget(char c) {
|
||||
byte script = TestUtility.getScript(c);
|
||||
return (script == targetScript || script == TestUtility.COMMON_SCRIPT);
|
||||
}
|
||||
|
||||
|
||||
final boolean isSource(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isSource(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isTarget(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isTarget(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isReceivingSource(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isReceivingSource(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean isReceivingTarget(String s) {
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
if (!isReceivingTarget(s.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static class TestHangul extends Test {
|
||||
TestHangul () {
|
||||
super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);
|
||||
}
|
||||
|
||||
public boolean isSource(char c) {
|
||||
if (0x1113 <= c && c <= 0x1160) return false;
|
||||
if (0x1176 <= c && c <= 0x11F9) return false;
|
||||
if (0x3131 <= c && c <= 0x318E) return false;
|
||||
return super.isSource(c);
|
||||
}
|
||||
}
|
||||
}
|
379
icu4j/src/com/ibm/test/translit/TestUtility.java
Executable file
379
icu4j/src/com/ibm/test/translit/TestUtility.java
Executable file
|
@ -0,0 +1,379 @@
|
|||
package com.ibm.test.translit;
|
||||
public final class TestUtility {
|
||||
|
||||
public static byte getScript(char c) {
|
||||
return getScript(getBlock(c));
|
||||
}
|
||||
|
||||
public static byte getScript(byte block) {
|
||||
return blockToScript[block];
|
||||
}
|
||||
|
||||
public static byte getBlock(char c) {
|
||||
int index = c >> 7;
|
||||
byte block = charToBlock[index];
|
||||
while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
|
||||
int[] tuple = split[-block-1];
|
||||
if (c < tuple[0]) block = (byte)tuple[1];
|
||||
else block = (byte)tuple[2];
|
||||
}
|
||||
return block;
|
||||
}
|
||||
|
||||
// returns next letter of script, or 0xFFFF if done
|
||||
|
||||
public static char getNextLetter(char c, byte script) {
|
||||
while (c < 0xFFFF) {
|
||||
++c;
|
||||
if (getScript(c) == script && Character.isLetter(c)) {
|
||||
return c;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
public static String hex(char ch) {
|
||||
String foo = Integer.toString(ch,16).toUpperCase();
|
||||
return "0000".substring(0,4-foo.length()) + foo;
|
||||
}
|
||||
|
||||
public static String hex(String s) {
|
||||
return hex(s,",");
|
||||
}
|
||||
|
||||
public static String hex(String s, String sep) {
|
||||
if (s.length() == 0) return "";
|
||||
String result = hex(s.charAt(0));
|
||||
for (int i = 1; i < s.length(); ++i) {
|
||||
result += sep;
|
||||
result += hex(s.charAt(i));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void test() {
|
||||
System.out.println("Blocks: ");
|
||||
byte lastblock = -128;
|
||||
for (char cc = 0; cc < 0xFFFF; ++cc) {
|
||||
byte block = TestUtility.getBlock(cc);
|
||||
if (block != lastblock) {
|
||||
System.out.println(TestUtility.hex(cc) + "\t" + block);
|
||||
lastblock = block;
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
System.out.println("Scripts: ");
|
||||
byte lastScript = -128;
|
||||
for (char cc = 0; cc < 0xFFFF; ++cc) {
|
||||
byte script = TestUtility.getScript(cc);
|
||||
if (script != lastScript) {
|
||||
System.out.println(TestUtility.hex(cc) + "\t" + script);
|
||||
lastScript = script;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static final byte // SCRIPT CODE
|
||||
COMMON_SCRIPT = 0,
|
||||
LATIN_SCRIPT = 1,
|
||||
GREEK_SCRIPT = 2,
|
||||
CYRILLIC_SCRIPT = 3,
|
||||
ARMENIAN_SCRIPT = 4,
|
||||
HEBREW_SCRIPT = 5,
|
||||
ARABIC_SCRIPT = 6,
|
||||
SYRIAC_SCRIPT = 7,
|
||||
THAANA_SCRIPT = 8,
|
||||
DEVANAGARI_SCRIPT = 9,
|
||||
BENGALI_SCRIPT = 10,
|
||||
GURMUKHI_SCRIPT = 11,
|
||||
GUJARATI_SCRIPT = 12,
|
||||
ORIYA_SCRIPT = 13,
|
||||
TAMIL_SCRIPT = 14,
|
||||
TELUGU_SCRIPT = 15,
|
||||
KANNADA_SCRIPT = 16,
|
||||
MALAYALAM_SCRIPT = 17,
|
||||
SINHALA_SCRIPT = 18,
|
||||
THAI_SCRIPT = 19,
|
||||
LAO_SCRIPT = 20,
|
||||
TIBETAN_SCRIPT = 21,
|
||||
MYANMAR_SCRIPT = 22,
|
||||
GEORGIAN_SCRIPT = 23,
|
||||
JAMO_SCRIPT = 24,
|
||||
HANGUL_SCRIPT = 25,
|
||||
ETHIOPIC_SCRIPT = 26,
|
||||
CHEROKEE_SCRIPT = 27,
|
||||
ABORIGINAL_SCRIPT = 28,
|
||||
OGHAM_SCRIPT = 29,
|
||||
RUNIC_SCRIPT = 30,
|
||||
KHMER_SCRIPT = 31,
|
||||
MONGOLIAN_SCRIPT = 32,
|
||||
HIRAGANA_SCRIPT = 33,
|
||||
KATAKANA_SCRIPT = 34,
|
||||
BOPOMOFO_SCRIPT = 35,
|
||||
HAN_SCRIPT = 36,
|
||||
YI_SCRIPT = 37;
|
||||
|
||||
public static final byte // block code
|
||||
RESERVED_BLOCK = 0,
|
||||
BASIC_LATIN = 1,
|
||||
LATIN_1_SUPPLEMENT = 2,
|
||||
LATIN_EXTENDED_A = 3,
|
||||
LATIN_EXTENDED_B = 4,
|
||||
IPA_EXTENSIONS = 5,
|
||||
SPACING_MODIFIER_LETTERS = 6,
|
||||
COMBINING_DIACRITICAL_MARKS = 7,
|
||||
GREEK = 8,
|
||||
CYRILLIC = 9,
|
||||
ARMENIAN = 10,
|
||||
HEBREW = 11,
|
||||
ARABIC = 12,
|
||||
SYRIAC = 13,
|
||||
THAANA = 14,
|
||||
DEVANAGARI = 15,
|
||||
BENGALI = 16,
|
||||
GURMUKHI = 17,
|
||||
GUJARATI = 18,
|
||||
ORIYA = 19,
|
||||
TAMIL = 20,
|
||||
TELUGU = 21,
|
||||
KANNADA = 22,
|
||||
MALAYALAM = 23,
|
||||
SINHALA = 24,
|
||||
THAI = 25,
|
||||
LAO = 26,
|
||||
TIBETAN = 27,
|
||||
MYANMAR = 28,
|
||||
GEORGIAN = 29,
|
||||
HANGUL_JAMO = 30,
|
||||
ETHIOPIC = 31,
|
||||
CHEROKEE = 32,
|
||||
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
|
||||
OGHAM = 34,
|
||||
RUNIC = 35,
|
||||
KHMER = 36,
|
||||
MONGOLIAN = 37,
|
||||
LATIN_EXTENDED_ADDITIONAL = 38,
|
||||
GREEK_EXTENDED = 39,
|
||||
GENERAL_PUNCTUATION = 40,
|
||||
SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
|
||||
CURRENCY_SYMBOLS = 42,
|
||||
COMBINING_MARKS_FOR_SYMBOLS = 43,
|
||||
LETTERLIKE_SYMBOLS = 44,
|
||||
NUMBER_FORMS = 45,
|
||||
ARROWS = 46,
|
||||
MATHEMATICAL_OPERATORS = 47,
|
||||
MISCELLANEOUS_TECHNICAL = 48,
|
||||
CONTROL_PICTURES = 49,
|
||||
OPTICAL_CHARACTER_RECOGNITION = 50,
|
||||
ENCLOSED_ALPHANUMERICS = 51,
|
||||
BOX_DRAWING = 52,
|
||||
BLOCK_ELEMENTS = 53,
|
||||
GEOMETRIC_SHAPES = 54,
|
||||
MISCELLANEOUS_SYMBOLS = 55,
|
||||
DINGBATS = 56,
|
||||
BRAILLE_PATTERNS = 57,
|
||||
CJK_RADICALS_SUPPLEMENT = 58,
|
||||
KANGXI_RADICALS = 59,
|
||||
IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
|
||||
CJK_SYMBOLS_AND_PUNCTUATION = 61,
|
||||
HIRAGANA = 62,
|
||||
KATAKANA = 63,
|
||||
BOPOMOFO = 64,
|
||||
HANGUL_COMPATIBILITY_JAMO = 65,
|
||||
KANBUN = 66,
|
||||
BOPOMOFO_EXTENDED = 67,
|
||||
ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
|
||||
CJK_COMPATIBILITY = 69,
|
||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
|
||||
CJK_UNIFIED_IDEOGRAPHS = 71,
|
||||
YI_SYLLABLES = 72,
|
||||
YI_RADICALS = 73,
|
||||
HANGUL_SYLLABLES = 74,
|
||||
HIGH_SURROGATES = 75,
|
||||
HIGH_PRIVATE_USE_SURROGATES = 76,
|
||||
LOW_SURROGATES = 77,
|
||||
PRIVATE_USE = 78,
|
||||
CJK_COMPATIBILITY_IDEOGRAPHS = 79,
|
||||
ALPHABETIC_PRESENTATION_FORMS = 80,
|
||||
ARABIC_PRESENTATION_FORMS_A = 81,
|
||||
COMBINING_HALF_MARKS = 82,
|
||||
CJK_COMPATIBILITY_FORMS = 83,
|
||||
SMALL_FORM_VARIANTS = 84,
|
||||
ARABIC_PRESENTATION_FORMS_B = 85,
|
||||
SPECIALS = 86,
|
||||
HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
|
||||
|
||||
static final byte[] blockToScript = {
|
||||
COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
|
||||
LATIN_SCRIPT, // 1, BASIC_LATIN
|
||||
LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
|
||||
LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
|
||||
LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
|
||||
LATIN_SCRIPT, // 5, IPA_EXTENSIONS
|
||||
COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
|
||||
COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
|
||||
GREEK_SCRIPT, // 8, GREEK
|
||||
CYRILLIC_SCRIPT, // 9, CYRILLIC
|
||||
ARMENIAN_SCRIPT, // 10, ARMENIAN
|
||||
HEBREW_SCRIPT, // 11, HEBREW
|
||||
ARABIC_SCRIPT, // 12, ARABIC
|
||||
SYRIAC_SCRIPT, // 13, SYRIAC
|
||||
THAANA_SCRIPT, // 14, THAANA
|
||||
DEVANAGARI_SCRIPT, // 15, DEVANAGARI
|
||||
BENGALI_SCRIPT, // 16, BENGALI
|
||||
GURMUKHI_SCRIPT, // 17, GURMUKHI
|
||||
GUJARATI_SCRIPT, // 18, GUJARATI
|
||||
ORIYA_SCRIPT, // 19, ORIYA
|
||||
TAMIL_SCRIPT, // 20, TAMIL
|
||||
TELUGU_SCRIPT, // 21, TELUGU
|
||||
KANNADA_SCRIPT, // 22, KANNADA
|
||||
MALAYALAM_SCRIPT, // 23, MALAYALAM
|
||||
SINHALA_SCRIPT, // 24, SINHALA
|
||||
THAI_SCRIPT, // 25, THAI
|
||||
LAO_SCRIPT, // 26, LAO
|
||||
TIBETAN_SCRIPT, // 27, TIBETAN
|
||||
MYANMAR_SCRIPT, // 28, MYANMAR
|
||||
GEORGIAN_SCRIPT, // 29, GEORGIAN
|
||||
JAMO_SCRIPT, // 30, HANGUL_JAMO
|
||||
ETHIOPIC_SCRIPT, // 31, ETHIOPIC
|
||||
CHEROKEE_SCRIPT, // 32, CHEROKEE
|
||||
ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
|
||||
OGHAM_SCRIPT, // 34, OGHAM
|
||||
RUNIC_SCRIPT, // 35, RUNIC
|
||||
KHMER_SCRIPT, // 36, KHMER
|
||||
MONGOLIAN_SCRIPT, // 37, MONGOLIAN
|
||||
LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
|
||||
GREEK_SCRIPT, // 39, GREEK_EXTENDED
|
||||
COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
|
||||
COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
|
||||
COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
|
||||
COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
|
||||
COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
|
||||
COMMON_SCRIPT, // 45, NUMBER_FORMS
|
||||
COMMON_SCRIPT, // 46, ARROWS
|
||||
COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
|
||||
COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
|
||||
COMMON_SCRIPT, // 49, CONTROL_PICTURES
|
||||
COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
|
||||
COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
|
||||
COMMON_SCRIPT, // 52, BOX_DRAWING
|
||||
COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
|
||||
COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
|
||||
COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
|
||||
COMMON_SCRIPT, // 56, DINGBATS
|
||||
COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
|
||||
HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
|
||||
HAN_SCRIPT, // 59, KANGXI_RADICALS
|
||||
HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
|
||||
COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
|
||||
HIRAGANA_SCRIPT, // 62, HIRAGANA
|
||||
KATAKANA_SCRIPT, // 63, KATAKANA
|
||||
BOPOMOFO_SCRIPT, // 64, BOPOMOFO
|
||||
JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
|
||||
HAN_SCRIPT, // 66, KANBUN
|
||||
BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
|
||||
COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
|
||||
COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
|
||||
HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
|
||||
HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
|
||||
YI_SCRIPT, // 72, YI_SYLLABLES
|
||||
YI_SCRIPT, // 73, YI_RADICALS
|
||||
HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
|
||||
COMMON_SCRIPT, // 75, HIGH_SURROGATES
|
||||
COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
|
||||
COMMON_SCRIPT, // 77, LOW_SURROGATES
|
||||
COMMON_SCRIPT, // 78, PRIVATE_USE
|
||||
HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
|
||||
COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
|
||||
ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
|
||||
COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
|
||||
COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
|
||||
COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
|
||||
ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
|
||||
COMMON_SCRIPT, // 86, SPECIALS
|
||||
COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
|
||||
COMMON_SCRIPT, // 88, SPECIALS
|
||||
};
|
||||
|
||||
// could be further reduced to a byte array, but I didn't bother.
|
||||
static final int[][] split = {
|
||||
{0x0250, 4, 5}, // -1
|
||||
{0x02B0, 5, 6}, // -2
|
||||
{0x0370, 7, 8}, // -3
|
||||
{0x0530, 0, 10}, // -4
|
||||
{0x0590, 10, 11}, // -5
|
||||
{0x0750, 13, 0}, // -6
|
||||
{0x07C0, 14, 0}, // -7
|
||||
{0x10A0, 28, 29}, // -8
|
||||
{0x13A0, 0, 32}, // -9
|
||||
{0x16A0, 34, 35}, // -10
|
||||
{0x18B0, 37, 0}, // -11
|
||||
{0x2070, 40, 41}, // -12
|
||||
{0x20A0, 41, -31}, // -13
|
||||
{0x2150, 44, 45}, // -14
|
||||
{0x2190, 45, 46}, // -15
|
||||
{0x2440, 49, -32}, // -16
|
||||
{0x25A0, 53, 54}, // -17
|
||||
{0x27C0, 56, 0}, // -18
|
||||
{0x2FE0, 59, -33}, // -19
|
||||
{0x3040, 61, 62}, // -20
|
||||
{0x30A0, 62, 63}, // -21
|
||||
{0x3130, 64, 65}, // -22
|
||||
{0x3190, 65, -34}, // -23
|
||||
{0x4DB6, 70, 0}, // -24
|
||||
{0xA490, 72, -35}, // -25
|
||||
{0xD7A4, 74, 0}, // -26
|
||||
{0xFB50, 80, 81}, // -27
|
||||
{0xFE20, 0, -36}, // -28
|
||||
{0xFEFF, 85, 86}, // -29
|
||||
{0xFFF0, 87, -37}, // -30
|
||||
{0x20D0, 42, 43}, // -31
|
||||
{0x2460, 50, 51}, // -32
|
||||
{0x2FF0, 0, 60}, // -33
|
||||
{0x31A0, 66, -38}, // -34
|
||||
{0xA4D0, 73, 0}, //-35
|
||||
{0xFE30, 82, -39}, //-36
|
||||
{0xFFFE, 88, 0}, //-37
|
||||
{0x31C0, 67, 0}, // -38
|
||||
{0xFE50, 83, -40}, //-39
|
||||
{0xFE70, 84, 85} // -40
|
||||
};
|
||||
|
||||
static final byte[] charToBlock = {
|
||||
1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
|
||||
0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
|
||||
28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
|
||||
37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
|
||||
-12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
|
||||
57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
|
||||
-20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
|
||||
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
|
||||
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
|
||||
70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
|
||||
72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
|
||||
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
|
||||
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
|
||||
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
|
||||
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
|
||||
74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
|
||||
75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
|
||||
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
|
||||
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
|
||||
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
|
||||
78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
|
||||
};
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue