ICU-1946 checked in tests for Arabic, Hebrew, Thai.

There are still a few niggling bugs left. X-SVN-Rev: 9150
2025-04-08 15:05:53 +00:00 · 2002-07-15 01:26:18 +00:00 · 2002-07-15 01:26:18 +00:00 · 28aa343a73
commit 28aa343a73
parent c56bb42770
5 changed files with 189 additions and 49 deletions
--- a/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
+++ b/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java,v $ 
- * $Date: 2002/07/14 22:03:24 $ 
- * $Revision: 1.20 $
+ * $Date: 2002/07/15 01:26:18 $ 
+ * $Revision: 1.21 $
 *
 *****************************************************************************************
 */
@ -31,7 +31,7 @@ import java.io.*;
 * <p>Copyright (c) IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: Demo.java,v $ $Revision: 1.20 $ $Date: 2002/07/14 22:03:24 $
+ * @version $RCSfile: Demo.java,v $ $Revision: 1.21 $ $Date: 2002/07/15 01:26:18 $
 */
 public class Demo extends Frame {

@ -428,6 +428,7 @@ public class Demo extends Frame {
        }
    }
    
+
    boolean transliterateTyping = true;
    Transliterator fromHex = Transliterator.getInstance("Hex-Any");
    InfoDialog helpDialog;
@ -625,11 +626,30 @@ public class Demo extends Frame {
                    first = false;
                }
            }
+            int dashPos = id.indexOf('-');
+            int slashPos = id.indexOf('/');
+            if (slashPos < 0) slashPos = id.length();
+            UnicodeSet sourceSuper = null;
+            try {
+                sourceSuper = new UnicodeSet("[:" + id.substring(0,dashPos) + ":]");
+            } catch (Exception e) {}
+            
+            UnicodeSet targetSuper = null;
+            try {
+                targetSuper = new UnicodeSet("[:" + id.substring(dashPos+1, slashPos) + ":]");
+            } catch (Exception e) {}
+            
            out.println("</table><ul>");
-            out.println("<li>Source Set:<ul><li>" + translit.getSourceSet().toPattern(true) + "</li></ul></li>");
-            out.println("<li>Reverse Target Set:<ul><li>" + lt.getTargetSet().toPattern(true) + "</li></ul></li>");
-            out.println("<li>Target Set:<ul><li>" + translit.getTargetSet().toPattern(true) + "</li></ul></li>");
-            out.println("<li>Reverse Source Set:<ul><li>" + lt.getSourceSet().toPattern(true) + "</li></ul></li>");
+            out.println("<p><b>NFD</b></p>");
+            out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
+            out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFD, true), sourceSuper) + "</li></ul></li>");
+            out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
+            out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFD, true), targetSuper) + "</li></ul></li>");
+            out.println("<p><b>NFKD</b></p>");
+            out.println("<li>Source Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getSourceSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
+            out.println("<li>Reverse Target Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getTargetSet(), Normalizer.NFKD, true), sourceSuper) + "</li></ul></li>");
+            out.println("<li>Target Set:<ul><li>" + toPattern(closeUnicodeSet(translit.getTargetSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
+            out.println("<li>Reverse Source Set:<ul><li>" + toPattern(closeUnicodeSet(lt.getSourceSet(), Normalizer.NFKD, true), targetSuper) + "</li></ul></li>");
            out.println("</ul></body>");
            out.close();
            System.out.println("Done Writing");
@ -638,6 +658,62 @@ public class Demo extends Frame {
        }
    }
    
+    
+    /**
+     * Expands <code>source</code> with case variants (optionally) and with characters
+     * related to its members by normalization, then returns it.
+     *
+     * <p>Three passes are made, each a single pass (no iteration to a fixed point):
+     * <ol>
+     * <li>if <code>caseToo</code> is set, the lowercase and uppercase mappings of every
+     *     member whose general category is uppercase, lowercase or titlecase letter
+     *     are added;</li>
+     * <li>for every member code point that is not already normalized under
+     *     <code>mode</code>, its normalized form is added;</li>
+     * <li>every defined, not-yet-normalized code point that is not in the set, but whose
+     *     normalized form satisfies <code>source.containsAll(...)</code>, is added
+     *     (and reported on standard output).</li>
+     * </ol>
+     * Multi-character strings already in the set are skipped by the first two passes.
+     *
+     * <p>The argument itself is modified; pass a copy if the original set must be kept.
+     *
+     * @param source  the set to expand; modified in place
+     * @param mode    normalization mode used for the normalization checks and mappings
+     * @param caseToo whether to add case variants before the normalization passes
+     * @return <code>source</code>, after expansion
+     */
+    static UnicodeSet closeUnicodeSet(UnicodeSet source, Normalizer.Mode mode, boolean caseToo) {
+        UnicodeSetIterator it = new UnicodeSetIterator(source);
+        // Additions are collected separately so the set is not modified while iterated.
+        UnicodeSet additions = new UnicodeSet();
+        int cp;
+
+        // First add all case equivalents
+        if (caseToo) {
+            while (it.next()) {
+                cp = it.codepoint;
+                if (cp == it.IS_STRING) continue; // string elements are not case-mapped here
+                int type = UCharacter.getType(cp);
+                if (type == Character.UPPERCASE_LETTER || type == Character.LOWERCASE_LETTER || type == Character.TITLECASE_LETTER) {
+                    additions.add(UCharacter.toLowerCase(UTF16.valueOf(cp)));
+                    additions.add(UCharacter.toUpperCase(UTF16.valueOf(cp)));
+                }
+            }
+            source.addAll(additions);
+            additions.clear();
+        }
+
+        // Now add the normalized form of every member that is not already normalized
+        it.reset(source);
+        while (it.next()) {
+            cp = it.codepoint;
+            if (cp == it.IS_STRING) continue;
+            if (Normalizer.isNormalized(cp, mode)) continue;
+            String decomp = Normalizer.normalize(cp, mode);
+            // NOTE(review): the whole normalized string is handed to add(String); confirm
+            // whether it is meant to be added as one string element or code point by code point.
+            additions.add(decomp);
+        }
+        source.addAll(additions);
+
+        // Now add any other character whose normalized form is covered by source.
+        // The upper bound is inclusive so the last code point, U+10FFFF, is scanned as well.
+        for (cp = 0; cp <= 0x10FFFF; ++cp) {
+            if (!UCharacter.isDefined(cp)) continue;
+            if (Normalizer.isNormalized(cp, mode)) continue;
+            if (source.contains(cp)) continue;
+
+            String decomp = Normalizer.normalize(cp, mode);
+            if (source.containsAll(decomp)) {
+                System.out.println("Adding: " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));
+                source.add(cp);
+            }
+        }
+        // For completeness, later we should add the canonical closure of all strings in source
+        return source;
+    }
+    
+    /**
+     * Renders <code>source</code> as a pattern string, optionally factored against a
+     * superset.
+     *
+     * <p>When <code>superset</code> is null the result is simply
+     * <code>source.toPattern(true)</code>. Otherwise every member of
+     * <code>superset</code> is first removed from <code>source</code> (the argument is
+     * modified), and the result is the superset pattern followed by the pattern of
+     * what remains, wrapped in a single pair of brackets.
+     *
+     * @param source   the set to render; modified when <code>superset</code> is non-null
+     * @param superset the set to factor out, or null for none
+     * @return the pattern string
+     */
+    static String toPattern(UnicodeSet source, UnicodeSet superset) {
+        if (superset == null) {
+            return source.toPattern(true);
+        }
+        source.removeAll(superset);
+        String supersetPattern = superset.toPattern(true);
+        String remainderPattern = source.toPattern(true);
+        return "[" + supersetPattern + " " + remainderPattern + "]";
+    }
+    
    static BreakIterator bi = BreakIterator.getWordInstance();
    
    static String titlecaseFirstWord(String line) {
--- a/icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
@ -132,6 +132,23 @@ public class RoundTripTest extends TestFmwk {
          .test("[a-zA-Z\u0110\u0111]", "[\u0400-\u045F]", null, this, new Legal());
    }
    
+    // Set pattern of Arabic-block code points handed to the Latin-Arabic round-trip
+    // test (second argument of Test.test in TestArabic).
+    static final String ARABIC = "[\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";
+
+    public void TestArabic() throws IOException, ParseException {
+        new Test("Latin-Arabic")
+          .test("[a-zA-Z\u02BE\u02BF\u207F]", ARABIC, null, this, new Legal());
+    }
+    
+    public void TestHebrew() throws IOException, ParseException {
+        new Test("Latin-Hebrew")
+          .test("[a-zA-Z\u02BC\u02BB]", "[[:hebrew:]-[\uFB00-\uFBFF]]", "[\u05F0\u05F1\u05F2]", this, new LegalHebrew());
+    }
+    
+    public void TestThai() throws IOException, ParseException {
+        new Test("Latin-Thai")
+          .test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268]", "[:thai:]", null, this, new LegalThai());
+    }
+    
    //----------------------------------
    // Inter-Indic Tests
    //----------------------------------
@ -615,6 +632,38 @@ public class RoundTripTest extends TestFmwk {
        }
    }
    
+    /**
+     * Legality filter for Thai: every string is legal unless its last character has
+     * the Logical_Order_Exception property. The empty string is legal.
+     */
+    public static class LegalThai extends Legal {
+        public boolean is(String sourceString) {
+            int length = sourceString.length();
+            if (length == 0) {
+                return true;
+            }
+            // Only the final UTF-16 code unit is inspected; surrogate pairs are ignored on purpose.
+            char last = sourceString.charAt(length - 1);
+            return !UCharacter.hasBinaryProperty(last, UProperty.LOGICAL_ORDER_EXCEPTION);
+        }
+    }
+    
+    /**
+     * Legality filter for Hebrew: anything is legal except that a letter in FINAL
+     * must not be followed by a letter, and a letter in NON_FINAL must be followed
+     * by one. The empty string is legal.
+     *
+     * <p>Only the immediately following UTF-16 code unit is examined (surrogates are
+     * not handled), and the end of the string is represented by a U+0000 sentinel
+     * that is tested against LETTER like any other character.
+     */
+    public static class LegalHebrew extends Legal {
+        // Letters that may not be followed by a letter.
+        static UnicodeSet FINAL = new UnicodeSet("[\u05DA\u05DD\u05DF\u05E3\u05E5]");
+        // Letters that must be followed by a letter.
+        static UnicodeSet NON_FINAL = new UnicodeSet("[\u05DB\u05DE\u05E0\u05E4\u05E6]");
+        static UnicodeSet LETTER = new UnicodeSet("[:letter:]");
+
+        /**
+         * @param sourceString the candidate source string
+         * @return false if any FINAL letter is followed by a letter or any NON_FINAL
+         *         letter is not followed by one; true otherwise
+         */
+        public boolean is(String sourceString) {
+            if (sourceString.length() == 0) return true;
+            // don't worry about surrogates.
+            for (int i = 0; i < sourceString.length(); ++i) {
+                char ch = sourceString.charAt(i);
+                // Look at the FOLLOWING character (i + 1), not at ch itself; reading index i
+                // here would compare every letter against itself.
+                char next = i+1 == sourceString.length() ? '\u0000' : sourceString.charAt(i+1);
+                if (FINAL.contains(ch)) {
+                    if (LETTER.contains(next)) return false;
+                } else if (NON_FINAL.contains(ch)) {
+                    if (!LETTER.contains(next)) return false;
+                }
+            }
+            return true;
+        }
+    }
+    
+    
    public static class LegalGreek extends Legal {
        
        boolean full;
@ -867,7 +916,7 @@ public class RoundTripTest extends TestFmwk {
            String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD
                
            if (!checkIrrelevants(sourceToTarget, irrelevants)) {
-                logFails("Source-Target, irrelevants");
+                logFails("Source-Target, Must not NFC everything");
            }
            if (!checkIrrelevants(targetToSource, irrelevants)) {
                logFails("Target-Source, irrelevants");
@ -1130,10 +1179,10 @@ public class RoundTripTest extends TestFmwk {
            if (++errorCount > errorLimit) {
                throw new TestTruncated("Test truncated; too many failures");
            }
-            out.println("<br>Fail " + label + ": " +
-                        from + " (" +
-                        TestUtility.hex(from) + ") => " +
-                        to + " (" +
+            out.println("<br>Fail " + label + ": \u200E" +
+                        from + "\u200E (" +
+                        TestUtility.hex(from) + ") => \u200E" +
+                        to + "\u200E (" +
                        TestUtility.hex(to) + ")"
                        );
        }
@ -1142,15 +1191,15 @@ public class RoundTripTest extends TestFmwk {
            if (++errorCount > errorLimit) {
                throw new TestTruncated("Test truncated; too many failures");
            }
-            out.println("<br>Fail (can.equiv)" + label + ": " +
-                        from + " (" +
-                        TestUtility.hex(from) + ") => " +
-                        to + " (" +
+            out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
+                        from + "\u200E (" +
+                        TestUtility.hex(from) + ") => \u200E" +
+                        to + "\u200E (" +
                        TestUtility.hex(to) + ")" +
-                        " -- " +
-                        fromCan + " (" +
-                        TestUtility.hex(fromCan) + ") => " +
-                        toCan + " (" +
+                        " -- \u200E" +
+                        fromCan + "\u200E (" +
+                        TestUtility.hex(fromCan) + ") => \u200E" +
+                        toCan + "\u200E (" +
                        TestUtility.hex(toCan) + ")"
                        );
        }
@ -1166,12 +1215,12 @@ public class RoundTripTest extends TestFmwk {
            if (++errorCount > errorLimit) {
                throw new TestTruncated("Test truncated; too many failures");
            }
-            out.println("<br>Fail (can.equiv)" + label + ": " +
-                        from + " (" +
-                        TestUtility.hex(from) + ") => " +
-                        to + " (" +
-                        TestUtility.hex(to) + ")" +
-                        toCan + " (" +
+            out.println("<br>Fail (can.equiv) " + label + ": \u200E" +
+                        from + "\u200E (" +
+                        TestUtility.hex(from) + ") => \u200E" +
+                        to + "\u200E (" +
+                        TestUtility.hex(to) + ")\u200E" +
+                        toCan + "\u200E (" +
                        TestUtility.hex(toCan) + ")"
                        );
        }
@ -1182,12 +1231,12 @@ public class RoundTripTest extends TestFmwk {
            if (++errorCount > errorLimit) {
                throw new TestTruncated("Test truncated; too many failures");
            }
-            out.println("<br>Fail Roundtrip: " + 
-                        from + " (" +
-                        TestUtility.hex(from) + ") "+toID+"=> " +
-                        to + " (" +
-                        TestUtility.hex(to) + ") " + backID+"=> " +
-                        back + " (" +
+            out.println("<br>Fail Roundtrip: \u200E" + 
+                        from + "\u200E (" +
+                        TestUtility.hex(from) + ") "+toID+"=> \u200E" +
+                        to + "\u200E (" +
+                        TestUtility.hex(to) + ") " + backID+"=> \u200E" +
+                        back + "\u200E (" +
                        TestUtility.hex(back) + ")" 
                        );
        }
--- a/icu4j/src/com/ibm/icu/impl/data/Transliterator_Arabic_Latin.txt
+++ b/icu4j/src/com/ibm/icu/impl/data/Transliterator_Arabic_Latin.txt
@ -3,8 +3,8 @@
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------
 # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Arabic_Latin.txt,v $
-# $Date: 2002/07/14 22:02:01 $
-# $Revision: 1.1 $
+# $Date: 2002/07/15 01:26:18 $
+# $Revision: 1.2 $
 #--------------------------------------------------------------------

 # Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
@ -18,8 +18,8 @@
 # While it could be done, we need to determine whether a prefix "al" could
 # occur other than as the definite article (since no space is used).

-:: NFD (NFC);
-:: lower () ;
+:: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
+:: NFKD (NFC);
 $disambig =  ̱ ; 
 $disambig2 =  ̰ ;
 $under =  ̣ ;
@ -74,6 +74,10 @@ $under =  ̣ ;
 ظ <> z $under ; # ARABIC LETTER ZAH
 غ <> g h $disambig ; # ARABIC LETTER GHAIN

+# WARNING: special case
+# These canonically rearrange, so we have to special-case the return
+ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
+
 # non-Arabic language
 ژ <> z h $disambig ; # ARABIC LETTER JEH
 ڭ <> n $disambig g ; # ARABIC LETTER NG
@ -129,9 +133,13 @@ $under =  ̣ ;
 گ <> g ; # ARABIC LETTER GAF

 # fallbacks
-
+| s < c } [eiy];
+| k < c ;
 | i < e ;
 | u < o ;
 | ks < x ;
+| n < ‎ⁿ;

-::NFC (NFD);
+:: (lower) ;
+::NFC (NFD);
+:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );
--- a/icu4j/src/com/ibm/icu/impl/data/Transliterator_Hebrew_Latin.txt
+++ b/icu4j/src/com/ibm/icu/impl/data/Transliterator_Hebrew_Latin.txt
@ -3,8 +3,8 @@
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------
 # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Hebrew_Latin.txt,v $
-# $Date: 2002/07/14 21:59:17 $
-# $Revision: 1.2 $
+# $Date: 2002/07/15 01:26:18 $
+# $Revision: 1.3 $
 #--------------------------------------------------------------------

 # Transliteration table for Hebrew
@ -27,8 +27,8 @@
 # For more information, see"
 #   http://oss.software.ibm.com/icu/userguide/Transliteration.html

-:: nfd (nfc) ;
-:: (lower);
+:: [[:Hebrew:] [\u05B0-\u05B9\u05BB-\u05BC\u05C1-\u05C2\u2135-\u2138]] ;
+:: nfkd (nfc) ;
 $letterAfter = [:M:]* [:L:] ;

 # move longer items here to avoid masking
@ -62,6 +62,11 @@ $letterAfter = [:M:]* [:L:] ;
 ק <> q ;
 ר <> r ;

+ װ > |  וו; # HEBREW LIGATURE YIDDISH DOUBLE VAV
+ ױ > | וי; # HEBREW LIGATURE YIDDISH VAV YOD
+ ײ > | יי ; # HEBREW LIGATURE YIDDISH DOUBLE YOD
+
+
 ּ <> ̇ ; # dagesh just goes to overdot for now
 ׁ <> ̌ ; # shin dot -> sh
 ׂ <> ̂ ; # sin dot -> s
@ -88,4 +93,6 @@ $letterAfter = [:M:]* [:L:] ;
 ו < v ;
 כס < x ;

-:: nfc (nfd) ;
+:: (lower);
+:: nfc (nfd) ;
+:: ([[:Latin:] [\u02BB-\u02BC\u0300-\u0302\u0307\u030C\u0327\u0331\u0340-\u0341]]);
--- a/icu4j/src/com/ibm/icu/impl/data/Transliterator_ThaiLogical_Latin.txt
+++ b/icu4j/src/com/ibm/icu/impl/data/Transliterator_ThaiLogical_Latin.txt
@ -3,8 +3,8 @@
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------
 # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_ThaiLogical_Latin.txt,v $
-# $Date: 2002/07/13 03:36:59 $
-# $Revision: 1.2 $
+# $Date: 2002/07/15 01:26:18 $
+# $Revision: 1.3 $
 #--------------------------------------------------------------------

 # Thai-Latin
@ -23,11 +23,11 @@

 # insert implicit vowel (and remove it going the other way)

-$consonant = [\u0E01-\u0E2E];
-$vowel = [\u0E30-\u0E3A\u0E40-\u0E44\u0E47];
+$consonant = [ก-ฮ];
+$vowel = [ะ-ฺเ-ไ็];

-{ ( $consonant ) } [^$vowel \uE000] > | $1 \uE000 ;
-\uE000 > ọ ;
+{ ( $consonant ) } [^$vowel ] > | $1  ;
+ > ọ ;
 < ọ ;

 # Consonants