mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 15:05:53 +00:00
ICU-1946 checked in tests for Arabic, Hebrew, Thai.
There are still a few niggling bugs left. X-SVN-Rev: 9150
This commit is contained in:
parent
c56bb42770
commit
28aa343a73
5 changed files with 189 additions and 49 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java,v $
|
||||
* $Date: 2002/07/14 22:03:24 $
|
||||
* $Revision: 1.20 $
|
||||
* $Date: 2002/07/15 01:26:18 $
|
||||
* $Revision: 1.21 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -31,7 +31,7 @@ import java.io.*;
|
|||
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Demo.java,v $ $Revision: 1.20 $ $Date: 2002/07/14 22:03:24 $
|
||||
* @version $RCSfile: Demo.java,v $ $Revision: 1.21 $ $Date: 2002/07/15 01:26:18 $
|
||||
*/
|
||||
public class Demo extends Frame {
|
||||
|
||||
|
@ -428,6 +428,7 @@ public class Demo extends Frame {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
boolean transliterateTyping = true;
|
||||
Transliterator fromHex = Transliterator.getInstance("Hex-Any");
|
||||
InfoDialog helpDialog;
|
||||
|
@ -625,11 +626,30 @@ public class Demo extends Frame {
|
|||
first = false;
|
||||
}
|
||||
}
|
||||
int dashPos = id.indexOf('-');
|
||||
int slashPos = id.indexOf('/');
|
||||
if (slashPos < 0) slashPos = id.length();
|
||||
UnicodeSet sourceSuper = null;
|
||||
try {
|
||||
sourceSuper = new UnicodeSet("[:" + id.substring(0,dashPos) + ":]");
|
||||
} catch (Exception e) {}
|
||||
|
||||
UnicodeSet targetSuper = null;
|
||||
try {
|
||||
targetSuper = new UnicodeSet("[:" + id.substring(dashPos+1, slashPos) + ":]");
|
||||
} catch (Exception e) {}
|
||||
|
||||
out.println("</table><ul>");
|
||||
out.println("<li>Source Set:<ul><li>" + translit.getSourceSet().toPattern(true) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Target Set:<ul><li>" + lt.getTargetSet().toPattern(true) + "</li></ul></li>");
|
||||
out.println("<li>Target Set:<ul><li>" + translit.getTargetSet().toPattern(true) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Source Set:<ul><li>" + lt.getSourceSet().toPattern(true) + "</li></ul></li>");
|
||||
out.println("<p><b>NFD</b></p>");
|
||||
out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
|
||||
out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
|
||||
out.println("<p><b>NFKD</b></p>");
|
||||
out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
|
||||
out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
|
||||
out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
|
||||
out.println("</ul></body>");
|
||||
out.close();
|
||||
System.out.println("Done Writing");
|
||||
|
@ -638,6 +658,62 @@ public class Demo extends Frame {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static UnicodeSet closeUnicodeSet(UnicodeSet source, Normalizer.Mode mode, boolean caseToo) {
|
||||
UnicodeSetIterator it = new UnicodeSetIterator(source);
|
||||
UnicodeSet additions = new UnicodeSet(); // to avoid messing up iterator
|
||||
int cp;
|
||||
|
||||
// First add all case equivalents
|
||||
if (caseToo) {
|
||||
while (it.next()) {
|
||||
cp = it.codepoint;
|
||||
if (cp == it.IS_STRING) continue;
|
||||
int type = UCharacter.getType(cp);
|
||||
if (type == Character.UPPERCASE_LETTER || type == Character.LOWERCASE_LETTER || type == Character.TITLECASE_LETTER) {
|
||||
additions.add(UCharacter.toLowerCase(UTF16.valueOf(cp)));
|
||||
additions.add(UCharacter.toUpperCase(UTF16.valueOf(cp)));
|
||||
}
|
||||
}
|
||||
source.addAll(additions);
|
||||
additions.clear();
|
||||
}
|
||||
|
||||
// Now add all decompositions of characters in source
|
||||
it.reset(source);
|
||||
while (it.next()) {
|
||||
cp = it.codepoint;
|
||||
if (cp == it.IS_STRING) continue;
|
||||
if (Normalizer.isNormalized(cp, mode)) continue;
|
||||
String decomp = Normalizer.normalize(cp, mode);
|
||||
additions.add(decomp);
|
||||
}
|
||||
source.addAll(additions);
|
||||
|
||||
// Now add any other character that decomposes to a character in source
|
||||
for (cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
if (!UCharacter.isDefined(cp)) continue;
|
||||
if (Normalizer.isNormalized(cp, mode)) continue;
|
||||
if (source.contains(cp)) continue;
|
||||
|
||||
String decomp = Normalizer.normalize(cp, mode);
|
||||
if (source.containsAll(decomp)) {
|
||||
System.out.println("Adding: " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));
|
||||
source.add(cp);
|
||||
}
|
||||
}
|
||||
// For completeness, later we should add the canonical closure of all strings in source
|
||||
return source;
|
||||
}
|
||||
|
||||
static String toPattern(UnicodeSet source, UnicodeSet superset) {
|
||||
if (superset != null) {
|
||||
source.removeAll(superset);
|
||||
return "[" + superset.toPattern(true) + " " + source.toPattern(true) + "]";
|
||||
}
|
||||
return source.toPattern(true);
|
||||
}
|
||||
|
||||
static BreakIterator bi = BreakIterator.getWordInstance();
|
||||
|
||||
static String titlecaseFirstWord(String line) {
|
||||
|
|
|
@ -132,6 +132,23 @@ public class RoundTripTest extends TestFmwk {
|
|||
.test("[a-zA-Z\u0110\u0111]", "[\u0400-\u045F]", null, this, new Legal());
|
||||
}
|
||||
|
||||
static final String ARABIC = "[\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";
|
||||
|
||||
public void TestArabic() throws IOException, ParseException {
|
||||
new Test("Latin-Arabic")
|
||||
.test("[a-zA-Z\u02BE\u02BF\u207F]", ARABIC, null, this, new Legal());
|
||||
}
|
||||
|
||||
public void TestHebrew() throws IOException, ParseException {
|
||||
new Test("Latin-Hebrew")
|
||||
.test("[a-zA-Z\u02BC\u02BB]", "[[:hebrew:]-[\uFB00-\uFBFF]]", "[\u05F0\u05F1\u05F2]", this, new LegalHebrew());
|
||||
}
|
||||
|
||||
public void TestThai() throws IOException, ParseException {
|
||||
new Test("Latin-Thai")
|
||||
.test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268]", "[:thai:]", null, this, new LegalThai());
|
||||
}
|
||||
|
||||
//----------------------------------
|
||||
// Inter-Indic Tests
|
||||
//----------------------------------
|
||||
|
@ -615,6 +632,38 @@ public class RoundTripTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
// anything is legal except word ending with Logical-order-exception
|
||||
public static class LegalThai extends Legal {
|
||||
public boolean is(String sourceString) {
|
||||
if (sourceString.length() == 0) return true;
|
||||
char ch = sourceString.charAt(sourceString.length() - 1); // don't worry about surrogates.
|
||||
if (UCharacter.hasBinaryProperty(ch, UProperty.LOGICAL_ORDER_EXCEPTION)) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// anything is legal except that Final letters can't be followed by letter; NonFinal must be
|
||||
public static class LegalHebrew extends Legal {
|
||||
static UnicodeSet FINAL = new UnicodeSet("[\u05DA\u05DD\u05DF\u05E3\u05E5]");
|
||||
static UnicodeSet NON_FINAL = new UnicodeSet("[\u05DB\u05DE\u05E0\u05E4\u05E6]");
|
||||
static UnicodeSet LETTER = new UnicodeSet("[:letter:]");
|
||||
public boolean is(String sourceString) {
|
||||
if (sourceString.length() == 0) return true;
|
||||
// don't worry about surrogates.
|
||||
for (int i = 0; i < sourceString.length(); ++i) {
|
||||
char ch = sourceString.charAt(i);
|
||||
char next = i+1 == sourceString.length() ? '\u0000' : sourceString.charAt(i);
|
||||
if (FINAL.contains(ch)) {
|
||||
if (LETTER.contains(next)) return false;
|
||||
} else if (NON_FINAL.contains(ch)) {
|
||||
if (!LETTER.contains(next)) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static class LegalGreek extends Legal {
|
||||
|
||||
boolean full;
|
||||
|
@ -867,7 +916,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD
|
||||
|
||||
if (!checkIrrelevants(sourceToTarget, irrelevants)) {
|
||||
logFails("Source-Target, irrelevants");
|
||||
logFails("Source-Target, Must not NFC everything");
|
||||
}
|
||||
if (!checkIrrelevants(targetToSource, irrelevants)) {
|
||||
logFails("Target-Source, irrelevants");
|
||||
|
@ -1130,10 +1179,10 @@ public class RoundTripTest extends TestFmwk {
|
|||
if (++errorCount > errorLimit) {
|
||||
throw new TestTruncated("Test truncated; too many failures");
|
||||
}
|
||||
out.println("<br>Fail " + label + ": " +
|
||||
from + " (" +
|
||||
TestUtility.hex(from) + ") => " +
|
||||
to + " (" +
|
||||
out.println("<br>Fail " + label + ": \u200E" +
|
||||
from + "\u200E (" +
|
||||
TestUtility.hex(from) + ") => \u200E" +
|
||||
to + "\u200E (" +
|
||||
TestUtility.hex(to) + ")"
|
||||
);
|
||||
}
|
||||
|
@ -1142,15 +1191,15 @@ public class RoundTripTest extends TestFmwk {
|
|||
if (++errorCount > errorLimit) {
|
||||
throw new TestTruncated("Test truncated; too many failures");
|
||||
}
|
||||
out.println("<br>Fail (can.equiv)" + label + ": " +
|
||||
from + " (" +
|
||||
TestUtility.hex(from) + ") => " +
|
||||
to + " (" +
|
||||
out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
|
||||
from + "\u200E (" +
|
||||
TestUtility.hex(from) + ") => \u200E" +
|
||||
to + "\u200E (" +
|
||||
TestUtility.hex(to) + ")" +
|
||||
" -- " +
|
||||
fromCan + " (" +
|
||||
TestUtility.hex(fromCan) + ") => " +
|
||||
toCan + " (" +
|
||||
" -- \u200E" +
|
||||
fromCan + "\u200E (" +
|
||||
TestUtility.hex(fromCan) + ") => \u200E" +
|
||||
toCan + "\u200E (" +
|
||||
TestUtility.hex(toCan) + ")"
|
||||
);
|
||||
}
|
||||
|
@ -1166,12 +1215,12 @@ public class RoundTripTest extends TestFmwk {
|
|||
if (++errorCount > errorLimit) {
|
||||
throw new TestTruncated("Test truncated; too many failures");
|
||||
}
|
||||
out.println("<br>Fail (can.equiv)" + label + ": " +
|
||||
from + " (" +
|
||||
TestUtility.hex(from) + ") => " +
|
||||
to + " (" +
|
||||
TestUtility.hex(to) + ")" +
|
||||
toCan + " (" +
|
||||
out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
|
||||
from + "\u200E (" +
|
||||
TestUtility.hex(from) + ") => \u200E" +
|
||||
to + "\u200E (" +
|
||||
TestUtility.hex(to) + ")\u200E" +
|
||||
toCan + "\u200E (" +
|
||||
TestUtility.hex(toCan) + ")"
|
||||
);
|
||||
}
|
||||
|
@ -1182,12 +1231,12 @@ public class RoundTripTest extends TestFmwk {
|
|||
if (++errorCount > errorLimit) {
|
||||
throw new TestTruncated("Test truncated; too many failures");
|
||||
}
|
||||
out.println("<br>Fail Roundtrip: " +
|
||||
from + " (" +
|
||||
TestUtility.hex(from) + ") "+toID+"=> " +
|
||||
to + " (" +
|
||||
TestUtility.hex(to) + ") " + backID+"=> " +
|
||||
back + " (" +
|
||||
out.println("<br>Fail Roundtrip: \u200E" +
|
||||
from + "\u200E (" +
|
||||
TestUtility.hex(from) + ") "+toID+"=> \u200E" +
|
||||
to + "\u200E (" +
|
||||
TestUtility.hex(to) + ") " + backID+"=> \u200E" +
|
||||
back + "\u200E (" +
|
||||
TestUtility.hex(back) + ")"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Arabic_Latin.txt,v $
|
||||
# $Date: 2002/07/14 22:02:01 $
|
||||
# $Revision: 1.1 $
|
||||
# $Date: 2002/07/15 01:26:18 $
|
||||
# $Revision: 1.2 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
|
||||
|
@ -18,8 +18,8 @@
|
|||
# While it could be done, we need to determine whether a prefix "al" could
|
||||
# occur other than as the definite article (since no space is used).
|
||||
|
||||
:: NFD (NFC);
|
||||
:: lower () ;
|
||||
:: [[:Arabic:] [ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
|
||||
:: NFKD (NFC);
|
||||
$disambig = ̱ ;
|
||||
$disambig2 = ̰ ;
|
||||
$under = ̣ ;
|
||||
|
@ -74,6 +74,10 @@ $under = ̣ ;
|
|||
ظ <> z $under ; # ARABIC LETTER ZAH
|
||||
غ <> g h $disambig ; # ARABIC LETTER GHAIN
|
||||
|
||||
# WARNING: special case
|
||||
# These canonically rearrange, so we have to special-case the return
|
||||
ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
|
||||
|
||||
# non-Arabic language
|
||||
ژ <> z h $disambig ; # ARABIC LETTER JEH
|
||||
ڭ <> n $disambig g ; # ARABIC LETTER NG
|
||||
|
@ -129,9 +133,13 @@ $under = ̣ ;
|
|||
گ <> g ; # ARABIC LETTER GAF
|
||||
|
||||
# fallbacks
|
||||
|
||||
| s < c } [eiy];
|
||||
| k < c ;
|
||||
| i < e ;
|
||||
| u < o ;
|
||||
| ks < x ;
|
||||
| n < ⁿ;
|
||||
|
||||
::NFC (NFD);
|
||||
:: (lower) ;
|
||||
::NFC (NFD);
|
||||
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );
|
|
@ -3,8 +3,8 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Hebrew_Latin.txt,v $
|
||||
# $Date: 2002/07/14 21:59:17 $
|
||||
# $Revision: 1.2 $
|
||||
# $Date: 2002/07/15 01:26:18 $
|
||||
# $Revision: 1.3 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Transliteration table for Hebrew
|
||||
|
@ -27,8 +27,8 @@
|
|||
# For more information, see:
|
||||
# http://oss.software.ibm.com/icu/userguide/Transliteration.html
|
||||
|
||||
:: nfd (nfc) ;
|
||||
:: (lower);
|
||||
:: [[:Hebrew:] [\u05B0-\u05B9\u05BB-\u05BC\u05C1-\u05C2\u2135-\u2138]] ;
|
||||
:: nfkd (nfc) ;
|
||||
$letterAfter = [:M:]* [:L:] ;
|
||||
|
||||
# move longer items here to avoid masking
|
||||
|
@ -62,6 +62,11 @@ $letterAfter = [:M:]* [:L:] ;
|
|||
ק <> q ;
|
||||
ר <> r ;
|
||||
|
||||
װ > | וו; # HEBREW LIGATURE YIDDISH DOUBLE VAV
|
||||
ױ > | וי; # HEBREW LIGATURE YIDDISH VAV YOD
|
||||
ײ > | יי ; # HEBREW LIGATURE YIDDISH DOUBLE YOD
|
||||
|
||||
|
||||
ּ <> ̇ ; # dagesh just goes to overdot for now
|
||||
ׁ <> ̌ ; # shin dot -> sh
|
||||
ׂ <> ̂ ; # sin dot -> s
|
||||
|
@ -88,4 +93,6 @@ $letterAfter = [:M:]* [:L:] ;
|
|||
ו < v ;
|
||||
כס < x ;
|
||||
|
||||
:: nfc (nfd) ;
|
||||
:: (lower);
|
||||
:: nfc (nfd) ;
|
||||
:: ([[:Latin:] [\u02BB-\u02BC\u0300-\u0302\u0307\u030C\u0327\u0331\u0340-\u0341]]);
|
|
@ -3,8 +3,8 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_ThaiLogical_Latin.txt,v $
|
||||
# $Date: 2002/07/13 03:36:59 $
|
||||
# $Revision: 1.2 $
|
||||
# $Date: 2002/07/15 01:26:18 $
|
||||
# $Revision: 1.3 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Thai-Latin
|
||||
|
@ -23,11 +23,11 @@
|
|||
|
||||
# insert implicit vowel (and remove it going the other way)
|
||||
|
||||
$consonant = [\u0E01-\u0E2E];
|
||||
$vowel = [\u0E30-\u0E3A\u0E40-\u0E44\u0E47];
|
||||
$consonant = [ก-ฮ];
|
||||
$vowel = [ะ-ฺเ-ไ็];
|
||||
|
||||
{ ( $consonant ) } [^$vowel \uE000] > | $1 \uE000 ;
|
||||
\uE000 > ọ ;
|
||||
{ ( $consonant ) } [^$vowel ] > | $1 ;
|
||||
> ọ ;
|
||||
< ọ ;
|
||||
|
||||
# Consonants
|
||||
|
|
Loading…
Add table
Reference in a new issue