From eb6243eb238e1c7a03f24d0b71df19d7c7a8c0d9 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Sat, 22 Jun 2002 21:05:34 +0000
Subject: [PATCH] added more conformance tests

X-SVN-Rev: 8928
---
 tools/unicodetools/com/ibm/text/UCA/Main.java | 10 +++--
 .../com/ibm/text/UCA/WriteCollationData.java  | 45 +++++++++++++++----
 .../com/ibm/text/UCD/DerivedProperty.java     |  6 +--
 .../com/ibm/text/UCD/Normalizer.java          | 17 ++++++-
 .../ibm/text/data/CheckCollationValidity.html |  2 +-
 .../com/ibm/text/data/FractionalUCA.txt       |  2 +-
 .../com/ibm/text/data/FractionalUCA_long.txt  |  2 +-
 7 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java
index 203bebd66bf..89f93cc0298 100644
--- a/tools/unicodetools/com/ibm/text/UCA/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCA/Main.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ 
-* $Date: 2002/06/15 02:47:12 $ 
-* $Revision: 1.6 $
+* $Date: 2002/06/22 21:02:16 $ 
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@@ -19,8 +19,10 @@ import com.ibm.text.utility.*;
 public class Main {
 	static final String UCDVersion = "";
 	static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA",
-	    "writeconformance", "writeconformanceshifted", 
-		"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
+		"WriteRules", "WriteRulesWithNames", "WriteRulesXML",
+		"writeconformance", "writeconformanceshifted", 
+		"short", "writeconformance", "writeconformanceshifted", 
+    };
 	
 	public static void main(String args[]) throws Exception {
 		
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
index f26f5a39dd5..bb6a895fb70 100644
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2002/06/22 01:21:08 $ 
-* $Revision: 1.21 $
+* $Date: 2002/06/22 21:02:16 $ 
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@@ -292,6 +292,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
     
     
     static void writeConformance(String filename, byte option, boolean shortPrint)  throws IOException {
+        Default.setUCD();
         //UCD ucd30 = UCD.make("3.0.0");
         
 /*
@@ -405,12 +406,16 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
             //log.println(source);
             char extra = source.charAt(source.length()-1);
             String clipped = source.substring(0, source.length()-1);
+            if (clipped.charAt(0) == LOW_ACCENT && extra != LOW_ACCENT) {
+                extra = LOW_ACCENT;
+                clipped = source.substring(1);
+            }
             if (!shortPrint) {
                 log.print(Utility.hex(source));
                 log.print(
                     ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
             } else {
-                log.print(source + "\t" + Utility.hex(clipped));
+                log.print(Utility.hex(source) + "\t" + Utility.hex(clipped));
             }
             log.println();
         }
@@ -424,17 +429,41 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
         addStringX(UTF32.valueOf32(x), option);
     }
     
-    static final char LOW_ACCENT = '\u0325';
+    static final char LOW_ACCENT = '\u0334';
+    static int addCounter = 0;
    
     static void addStringX(String s, byte option) {
+        int firstChar = UTF16.charAt(s,0);
+        // append characters of different strengths after s, to verify the resulting order
         addStringY(s + 'a', option);
-        addStringY(s + 'A', option);
-        addStringY(s + 'á', option);
         addStringY(s + 'b', option);
-        addStringY(s + LOW_ACCENT, option);
+        addStringY(s + 'á', option);
+        addStringY(s + 'A', option);
         addStringY(s + '!', option);
+        if (option == SHIFTED && collator.isVariable(firstChar)) addStringY(s + LOW_ACCENT, option);
+
+        // NOW, if the character decomposes, or is a combining mark (non-zero), try combinations
+
+        if (Default.ucd.getCombiningClass(firstChar) > 0
+            || (!Default.nfd.isNormalized(s) && !Default.ucd.isHangulSyllable(firstChar))) {
+            // only when the NFD form ends with a non-starter: add canonical variants of s + LOW_ACCENT
+            String decomp = Default.nfd.normalize(s);
+            if (Default.ucd.getCombiningClass(UTF16.charAt(decomp, decomp.length()-1)) > 0) {
+                if (canIt == null) canIt = new CanonicalIterator(".");
+                canIt.setSource(s + LOW_ACCENT);
+                int limit = 4;
+                for (String can = canIt.next(); can != null; can = canIt.next()) {
+                    if (s.equals(can)) continue;
+                    if (--limit < 0) break; // sample is complete; setSource() restarts the iterator next time
+                    addStringY(can, option);
+                    // System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(can));
+                }
+            }
+        }
     }
     
+    static CanonicalIterator canIt = null;
+    
     static char counter;
     
     static void addStringY(String s, byte option) {
@@ -2162,7 +2191,7 @@ F900..FAFF; CJK Compatibility Ideographs
         
         Set additionalSet = new HashSet();
         System.out.println("Loading canonical iterator");
-        CanonicalIterator canIt = new CanonicalIterator(".");
+        if (canIt == null) canIt = new CanonicalIterator(".");
         Iterator it2 = contentsForCanonicalIteration.iterator();
         System.out.println("Adding any FCD equivalents that have different sort keys");
         while (it2.hasNext()) {
diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
index 9039052aa7d..1b0b4e1b59b 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2002/06/22 01:21:09 $
-* $Revision: 1.15 $
+* $Date: 2002/06/22 21:02:16 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */
@@ -745,8 +745,8 @@ of characters, the first of which has a non-zero combining class.
     
     public static void test() {
         Default.setUCD();
-        DerivedProperty dprop = new DerivedProperty(Default.ucd);
         /*
+        DerivedProperty dprop = new DerivedProperty(Default.ucd);
         for (int j = 0; j < LIMIT; ++j) {
             System.out.println();
             System.out.println(j + "\t" + dprop.getName(j));
diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
index f05e38c7ad7..47f3c33971b 100644
--- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.10 $
+* $Date: 2002/06/22 21:02:16 $
+* $Revision: 1.11 $
 *
 *******************************************************************************
 */
@@ -226,6 +226,19 @@ public final class Normalizer implements UCD_Types {
         return !data.normalizationDiffers(ch, composition, compatibility);
     }
 
+    /**
+    * Utility: Checks whether a string is already normalized according to this normalizer.
+    * A single code point uses the per-character data; anything else is compared to normalize(s).
+    * @param   s       the source string
+    * @return  true if s is already in the form produced by this normalizer
+    */
+    public boolean isNormalized(String s) {
+        if (UTF16.countCodePoint(s) == 1) { // fast path is only valid for exactly one code point
+            return !data.normalizationDiffers(UTF16.charAt(s,0), composition, compatibility);
+        }
+        return s.equals(normalize(s)); // TODO: OPTIMIZE LATER
+    }
+
     /**
     * Utility: Gets recursive decomposition of a character from the
     * Unicode Character Database.
diff --git a/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html b/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html
index 6d003d93a2c..491a7ab5635 100644
--- a/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html
+++ b/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html
@@ -1,7 +1,7 @@
 <html><body>
 <h1
  >1. Mismatches when NFD is OFF</h1><h2
- >Date:Fri Jun 21 16:56:03 PDT 2002</h2><h2
+ >Date:Sat Jun 22 13:56:49 PDT 2002</h2><h2
  >File Version:-3.1.1d1</h2><p
  >Alternate Handling = NON_IGNORABLE</p><table border="1"
  ><caption
diff --git a/tools/unicodetools/com/ibm/text/data/FractionalUCA.txt b/tools/unicodetools/com/ibm/text/data/FractionalUCA.txt
index 999ed5d7457..42dabc8cd6b 100644
--- a/tools/unicodetools/com/ibm/text/data/FractionalUCA.txt
+++ b/tools/unicodetools/com/ibm/text/data/FractionalUCA.txt
@@ -1,5 +1,5 @@
 # Fractional UCA Table, generated from standard UCA
-# M. Davis, Fri Jun 21 16:56:12 PDT 2002
+# M. Davis, Sat Jun 22 13:56:57 PDT 2002
 # VERSION: UCA=3.1.1d1, UCD=3.2.0
 
 # Generated processed version, as described in ICU design document.
diff --git a/tools/unicodetools/com/ibm/text/data/FractionalUCA_long.txt b/tools/unicodetools/com/ibm/text/data/FractionalUCA_long.txt
index 33d13674914..2f85b79ca60 100644
--- a/tools/unicodetools/com/ibm/text/data/FractionalUCA_long.txt
+++ b/tools/unicodetools/com/ibm/text/data/FractionalUCA_long.txt
@@ -1,5 +1,5 @@
 # Fractional UCA Table, generated from standard UCA
-# M. Davis, Fri Jun 21 16:56:12 PDT 2002
+# M. Davis, Sat Jun 22 13:56:57 PDT 2002
 # VERSION: UCA=3.1.1d1, UCD=3.2.0
 
 # Generated processed version, as described in ICU design document.