ICU-1946 checked in tests for Arabic, Hebrew, Thai.

There are still a few niggling bugs left.

X-SVN-Rev: 9150
This commit is contained in:
Mark Davis 2002-07-15 01:26:18 +00:00
parent c56bb42770
commit 28aa343a73
5 changed files with 189 additions and 49 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java,v $
* $Date: 2002/07/14 22:03:24 $
* $Revision: 1.20 $
* $Date: 2002/07/15 01:26:18 $
* $Revision: 1.21 $
*
*****************************************************************************************
*/
@ -31,7 +31,7 @@ import java.io.*;
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: Demo.java,v $ $Revision: 1.20 $ $Date: 2002/07/14 22:03:24 $
* @version $RCSfile: Demo.java,v $ $Revision: 1.21 $ $Date: 2002/07/15 01:26:18 $
*/
public class Demo extends Frame {
@ -428,6 +428,7 @@ public class Demo extends Frame {
}
}
boolean transliterateTyping = true;
Transliterator fromHex = Transliterator.getInstance("Hex-Any");
InfoDialog helpDialog;
@ -625,11 +626,30 @@ public class Demo extends Frame {
first = false;
}
}
int dashPos = id.indexOf('-');
int slashPos = id.indexOf('/');
if (slashPos < 0) slashPos = id.length();
UnicodeSet sourceSuper = null;
try {
sourceSuper = new UnicodeSet("[:" + id.substring(0,dashPos) + ":]");
} catch (Exception e) {}
UnicodeSet targetSuper = null;
try {
targetSuper = new UnicodeSet("[:" + id.substring(dashPos+1, slashPos) + ":]");
} catch (Exception e) {}
out.println("</table><ul>");
out.println("<li>Source Set:<ul><li>" + translit.getSourceSet().toPattern(true) + "</li></ul></li>");
out.println("<li>Reverse Target Set:<ul><li>" + lt.getTargetSet().toPattern(true) + "</li></ul></li>");
out.println("<li>Target Set:<ul><li>" + translit.getTargetSet().toPattern(true) + "</li></ul></li>");
out.println("<li>Reverse Source Set:<ul><li>" + lt.getSourceSet().toPattern(true) + "</li></ul></li>");
out.println("<p><b>NFD</b></p>");
out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
out.println("<p><b>NFKD</b></p>");
out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
out.println("</ul></body>");
out.close();
System.out.println("Done Writing");
@ -638,6 +658,62 @@ public class Demo extends Frame {
}
}
/**
 * Expands a set, in place, with case variants (optionally) and with
 * decomposition-related characters for the given normalization mode.
 *
 * <p>The work is done in three passes:
 * <ol>
 * <li>if <code>caseToo</code> is set, the lowercase and uppercase mappings of every
 *     cased letter (upper, lower or title case) in the set are added;</li>
 * <li>for every code point in the set that is not already normalized in
 *     <code>mode</code>, its normalized form is added;</li>
 * <li>every defined code point in the whole Unicode range that is not normalized,
 *     is not yet in the set, and whose normalized form consists only of characters
 *     in the set, is added (and reported on <code>System.out</code>).</li>
 * </ol>
 * Multi-character strings already contained in the set are skipped in passes 1 and 2.
 *
 * <p>NOTE: <code>source</code> itself is modified; the returned reference is the same object.
 *
 * @param source  the set to expand; modified in place
 * @param mode    normalization mode used for the decomposition passes
 * @param caseToo whether to run the case-variant pass first
 * @return <code>source</code>, after expansion
 */
static UnicodeSet closeUnicodeSet(UnicodeSet source, Normalizer.Mode mode, boolean caseToo) {
    UnicodeSetIterator it = new UnicodeSetIterator(source);
    // Collected separately so that source is never modified while it is being iterated.
    UnicodeSet additions = new UnicodeSet();
    int cp;

    // Pass 1: add all case equivalents
    if (caseToo) {
        while (it.next()) {
            cp = it.codepoint;
            if (cp == it.IS_STRING) continue; // multi-character string entry, not a code point
            int type = UCharacter.getType(cp);
            if (type == Character.UPPERCASE_LETTER
                    || type == Character.LOWERCASE_LETTER
                    || type == Character.TITLECASE_LETTER) {
                additions.add(UCharacter.toLowerCase(UTF16.valueOf(cp)));
                additions.add(UCharacter.toUpperCase(UTF16.valueOf(cp)));
            }
        }
        source.addAll(additions);
        additions.clear();
    }

    // Pass 2: add the decompositions of characters in source
    it.reset(source);
    while (it.next()) {
        cp = it.codepoint;
        if (cp == it.IS_STRING) continue;
        if (Normalizer.isNormalized(cp, mode)) continue;
        String decomp = Normalizer.normalize(cp, mode);
        additions.add(decomp);
    }
    source.addAll(additions);

    // Pass 3: add any other character that decomposes to characters in source.
    // The bound is inclusive so that the last code point, U+10FFFF, is examined too.
    for (cp = 0; cp <= 0x10FFFF; ++cp) {
        if (!UCharacter.isDefined(cp)) continue;
        if (Normalizer.isNormalized(cp, mode)) continue;
        if (source.contains(cp)) continue;
        String decomp = Normalizer.normalize(cp, mode);
        if (source.containsAll(decomp)) {
            System.out.println("Adding: " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));
            source.add(cp);
        }
    }

    // For completeness, later we should add the canonical closure of all strings in source
    return source;
}
/**
 * Renders a set as a pattern, optionally expressed relative to a superset.
 *
 * <p>When <code>superset</code> is null the result is simply the escaped pattern of
 * <code>source</code>. Otherwise everything contained in <code>superset</code> is removed
 * from <code>source</code> (NOTE: <code>source</code> is modified), and the result is a
 * bracketed pattern made of the superset's pattern, a space, and the pattern of what
 * remains of <code>source</code>.
 *
 * @param source   the set to render; modified when <code>superset</code> is non-null
 * @param superset set to factor out of <code>source</code>, or null for none
 * @return the pattern string
 */
static String toPattern(UnicodeSet source, UnicodeSet superset) {
    if (superset == null) {
        return source.toPattern(true);
    }
    // Strip the superset's members first so only the leftovers are listed explicitly.
    source.removeAll(superset);
    String supersetPattern = superset.toPattern(true);
    String remainderPattern = source.toPattern(true);
    return "[" + supersetPattern + " " + remainderPattern + "]";
}
static BreakIterator bi = BreakIterator.getWordInstance();
static String titlecaseFirstWord(String line) {

View file

@ -132,6 +132,23 @@ public class RoundTripTest extends TestFmwk {
.test("[a-zA-Z\u0110\u0111]", "[\u0400-\u045F]", null, this, new Legal());
}
// Arabic-side character set used as the target domain of the Latin-Arabic test.
static final String ARABIC = "[\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";

/**
 * Runs the test for the "Latin-Arabic" transliterator, using the Latin
 * set given below as the source domain, {@link #ARABIC} as the target domain, and
 * the default {@link Legal} filter.
 */
public void TestArabic() throws IOException, ParseException {
    Test latinArabic = new Test("Latin-Arabic");
    String latinSet = "[a-zA-Z\u02BE\u02BF\u207F]";
    latinArabic.test(latinSet, ARABIC, null, this, new Legal());
}
/**
 * Runs the test for the "Latin-Hebrew" transliterator. The third set
 * passed to the test lists the three Yiddish ligature code points; Hebrew-specific
 * legality is decided by {@link LegalHebrew}.
 */
public void TestHebrew() throws IOException, ParseException {
    Test latinHebrew = new Test("Latin-Hebrew");
    String latinSet = "[a-zA-Z\u02BC\u02BB]";
    String hebrewSet = "[[:hebrew:]-[\uFB00-\uFBFF]]";
    String thirdSet = "[\u05F0\u05F1\u05F2]";
    latinHebrew.test(latinSet, hebrewSet, thirdSet, this, new LegalHebrew());
}
/**
 * Runs the test for the "Latin-Thai" transliterator, with legality of
 * source strings decided by {@link LegalThai}.
 */
public void TestThai() throws IOException, ParseException {
    Test latinThai = new Test("Latin-Thai");
    String latinSet = "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268]";
    String thaiSet = "[:thai:]";
    latinThai.test(latinSet, thaiSet, null, this, new LegalThai());
}
//----------------------------------
// Inter-Indic Tests
//----------------------------------
@ -615,6 +632,38 @@ public class RoundTripTest extends TestFmwk {
}
}
/**
 * Legality filter for Thai source strings: every string is legal unless its last
 * character has the Logical_Order_Exception property. The empty string is legal.
 */
public static class LegalThai extends Legal {
    public boolean is(String sourceString) {
        int length = sourceString.length();
        if (length == 0) {
            return true;
        }
        // Only the final UTF-16 code unit is inspected; surrogates are not a concern here.
        char last = sourceString.charAt(length - 1);
        return !UCharacter.hasBinaryProperty(last, UProperty.LOGICAL_ORDER_EXCEPTION);
    }
}
/**
 * Legality filter for Hebrew source strings. Anything is legal except that:
 * <ul>
 * <li>a final-form letter (a member of FINAL) must not be followed by a letter;</li>
 * <li>a letter that has a final form (a member of NON_FINAL) must be followed by a
 *     letter, so it is illegal as the last character of the string.</li>
 * </ul>
 * The end of the string is represented by NUL, which is not a letter.
 */
public static class LegalHebrew extends Legal {
    // Final forms: kaf, mem, nun, pe, tsadi.
    static UnicodeSet FINAL = new UnicodeSet("[\u05DA\u05DD\u05DF\u05E3\u05E5]");
    // The corresponding non-final forms of the same five letters.
    static UnicodeSet NON_FINAL = new UnicodeSet("[\u05DB\u05DE\u05E0\u05E4\u05E6]");
    static UnicodeSet LETTER = new UnicodeSet("[:letter:]");

    /**
     * @param sourceString the candidate source string
     * @return true if the string obeys the final/non-final letter rules
     */
    public boolean is(String sourceString) {
        int length = sourceString.length();
        if (length == 0) return true;
        // don't worry about surrogates.
        for (int i = 0; i < length; ++i) {
            char ch = sourceString.charAt(i);
            // Look at the FOLLOWING character (index i + 1), not at ch itself;
            // NUL stands in for "nothing follows" at the end of the string.
            char next = i + 1 == length ? '\u0000' : sourceString.charAt(i + 1);
            if (FINAL.contains(ch)) {
                if (LETTER.contains(next)) return false;
            } else if (NON_FINAL.contains(ch)) {
                if (!LETTER.contains(next)) return false;
            }
        }
        return true;
    }
}
public static class LegalGreek extends Legal {
boolean full;
@ -867,7 +916,7 @@ public class RoundTripTest extends TestFmwk {
String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD
if (!checkIrrelevants(sourceToTarget, irrelevants)) {
logFails("Source-Target, irrelevants");
logFails("Source-Target, Must not NFC everything");
}
if (!checkIrrelevants(targetToSource, irrelevants)) {
logFails("Target-Source, irrelevants");
@ -1130,10 +1179,10 @@ public class RoundTripTest extends TestFmwk {
if (++errorCount > errorLimit) {
throw new TestTruncated("Test truncated; too many failures");
}
out.println("<br>Fail " + label + ": " +
from + " (" +
TestUtility.hex(from) + ") => " +
to + " (" +
out.println("<br>Fail " + label + ": \u200E" +
from + "\u200E (" +
TestUtility.hex(from) + ") => \u200E" +
to + "\u200E (" +
TestUtility.hex(to) + ")"
);
}
@ -1142,15 +1191,15 @@ public class RoundTripTest extends TestFmwk {
if (++errorCount > errorLimit) {
throw new TestTruncated("Test truncated; too many failures");
}
out.println("<br>Fail (can.equiv)" + label + ": " +
from + " (" +
TestUtility.hex(from) + ") => " +
to + " (" +
out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
from + "\u200E (" +
TestUtility.hex(from) + ") => \u200E" +
to + "\u200E (" +
TestUtility.hex(to) + ")" +
" -- " +
fromCan + " (" +
TestUtility.hex(fromCan) + ") => " +
toCan + " (" +
" -- \u200E" +
fromCan + "\u200E (" +
TestUtility.hex(fromCan) + ") => \u200E" +
toCan + "\u200E (" +
TestUtility.hex(toCan) + ")"
);
}
@ -1166,12 +1215,12 @@ public class RoundTripTest extends TestFmwk {
if (++errorCount > errorLimit) {
throw new TestTruncated("Test truncated; too many failures");
}
out.println("<br>Fail (can.equiv)" + label + ": " +
from + " (" +
TestUtility.hex(from) + ") => " +
to + " (" +
TestUtility.hex(to) + ")" +
toCan + " (" +
out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
from + "\u200E (" +
TestUtility.hex(from) + ") => \u200E" +
to + "\u200E (" +
TestUtility.hex(to) + ")\u200E" +
toCan + "\u200E (" +
TestUtility.hex(toCan) + ")"
);
}
@ -1182,12 +1231,12 @@ public class RoundTripTest extends TestFmwk {
if (++errorCount > errorLimit) {
throw new TestTruncated("Test truncated; too many failures");
}
out.println("<br>Fail Roundtrip: " +
from + " (" +
TestUtility.hex(from) + ") "+toID+"=> " +
to + " (" +
TestUtility.hex(to) + ") " + backID+"=> " +
back + " (" +
out.println("<br>Fail Roundtrip: \u200E" +
from + "\u200E (" +
TestUtility.hex(from) + ") "+toID+"=> \u200E" +
to + "\u200E (" +
TestUtility.hex(to) + ") " + backID+"=> \u200E" +
back + "\u200E (" +
TestUtility.hex(back) + ")"
);
}

View file

@ -3,8 +3,8 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Arabic_Latin.txt,v $
# $Date: 2002/07/14 22:02:01 $
# $Revision: 1.1 $
# $Date: 2002/07/15 01:26:18 $
# $Revision: 1.2 $
#--------------------------------------------------------------------
# Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
@ -18,8 +18,8 @@
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
:: NFD (NFC);
:: lower () ;
:: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
:: NFKD (NFC);
$disambig = ̱ ;
$disambig2 = ̰ ;
$under = ̣ ;
@ -74,6 +74,10 @@ $under = ̣ ;
ظ <> z $under ; # ARABIC LETTER ZAH
غ <> g h $disambig ; # ARABIC LETTER GHAIN
# WARNING: special case
# These canonically rearrange, so we have to special-case the return
ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
# non-Arabic language
ژ <> z h $disambig ; # ARABIC LETTER JEH
ڭ <> n $disambig g ; # ARABIC LETTER NG
@ -129,9 +133,13 @@ $under = ̣ ;
گ <> g ; # ARABIC LETTER GAF
# fallbacks
| s < c } [eiy];
| k < c ;
| i < e ;
| u < o ;
| ks < x ;
| n < ‎ⁿ;
::NFC (NFD);
:: (lower) ;
::NFC (NFD);
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );

View file

@ -3,8 +3,8 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Hebrew_Latin.txt,v $
# $Date: 2002/07/14 21:59:17 $
# $Revision: 1.2 $
# $Date: 2002/07/15 01:26:18 $
# $Revision: 1.3 $
#--------------------------------------------------------------------
# Transliteration table for Hebrew
@ -27,8 +27,8 @@
# For more information, see:
# http://oss.software.ibm.com/icu/userguide/Transliteration.html
:: nfd (nfc) ;
:: (lower);
:: [[:Hebrew:] [\u05B0-\u05B9\u05BB-\u05BC\u05C1-\u05C2\u2135-\u2138]] ;
:: nfkd (nfc) ;
$letterAfter = [:M:]* [:L:] ;
# move longer items here to avoid masking
@ -62,6 +62,11 @@ $letterAfter = [:M:]* [:L:] ;
ק <> q ;
ר <> r ;
װ > | וו; # HEBREW LIGATURE YIDDISH DOUBLE VAV
ױ > | וי; # HEBREW LIGATURE YIDDISH VAV YOD
ײ > | יי ; # HEBREW LIGATURE YIDDISH DOUBLE YOD
ּ <> ̇ ; # dagesh just goes to overdot for now
ׁ <> ̌ ; # shin dot -> sh
ׂ <> ̂ ; # sin dot -> s
@ -88,4 +93,6 @@ $letterAfter = [:M:]* [:L:] ;
ו < v ;
כס < x ;
:: nfc (nfd) ;
:: (lower);
:: nfc (nfd) ;
:: ([[:Latin:] [\u02BB-\u02BC\u0300-\u0302\u0307\u030C\u0327\u0331\u0340-\u0341]]);

View file

@ -3,8 +3,8 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_ThaiLogical_Latin.txt,v $
# $Date: 2002/07/13 03:36:59 $
# $Revision: 1.2 $
# $Date: 2002/07/15 01:26:18 $
# $Revision: 1.3 $
#--------------------------------------------------------------------
# Thai-Latin
@ -23,11 +23,11 @@
# insert implicit vowel (and remove it going the other way)
$consonant = [\u0E01-\u0E2E];
$vowel = [\u0E30-\u0E3A\u0E40-\u0E44\u0E47];
$consonant = [ก-ฮ];
$vowel = [ะ-ฺเ-ไ็];
{ ( $consonant ) } [^$vowel \uE000] > | $1 \uE000 ;
\uE000 > ọ ;
{ ( $consonant ) } [^$vowel ] > | $1  ;
> ọ ;
< ọ ;
# Consonants