ICU-1682 fixed exhaustive test.

Added 20% optimization (doesn't try to permute class zeros) X-SVN-Rev: 8100
2025-04-11 08:01:32 +00:00 · 2002-03-19 00:18:44 +00:00 · 2002-03-19 00:18:44 +00:00 · 5394fe42d1
commit 5394fe42d1
parent d55cb3d234
2 changed files with 155 additions and 29 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java
+++ b/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java,v $ 
- * $Date: 2002/03/14 22:43:03 $ 
- * $Revision: 1.6 $
+ * $Date: 2002/03/19 00:18:44 $ 
+ * $Revision: 1.7 $
 *
 *****************************************************************************************
 */
@ -41,7 +41,18 @@ public class TestCanonicalIterator extends TestFmwk {
    
    public void TestExhaustive() {
    	int counter = 0;
+    	int mixedCounter = 0;
+    	int lastMixedCounter = -1;
    	CanonicalIterator it = new CanonicalIterator("");
+    	/*
+    	CanonicalIterator slowIt = new CanonicalIterator("");
+    	slowIt.SKIP_ZEROS = false;
+    	*/
+    	Transliterator name = Transliterator.getInstance("[^\\u0020-\\u007F] name");
+    	Set itSet = new TreeSet();
+    	Set slowItSet = new TreeSet();
+    	
+    	
    	for (int i = 0; i < 0x10FFFF; ++i) {
    		
    		// skip characters we know don't have decomps
@ -65,16 +76,99 @@ public class TestCanonicalIterator extends TestFmwk {
    		while (true) {
    			String item = it.next();
    			if (item == null) break;
-    			if (!item.equals(s)) gotSource = true;
-    			if (!item.equals(decomp)) gotDecomp = true;
-    			if (!item.equals(comp)) gotComp = true;
+    			if (item.equals(s)) gotSource = true;
+    			if (item.equals(decomp)) gotDecomp = true;
+    			if (item.equals(comp)) gotComp = true;
+    			if ((mixedCounter & 0x7F) == 0 && (i < 0xAD00 || i > 0xAC00 + 11172)) {
+    				if (lastMixedCounter != mixedCounter) {
+    					logln("");
+    					lastMixedCounter = mixedCounter;
+    				}
+    				logln("\t" + mixedCounter + "\t" + name.transliterate(item)
+    				+ (item.equals(s) ? "\t(*original*)" : "")
+    				+ (item.equals(decomp) ? "\t(*decomp*)" : "")
+    				+ (item.equals(comp) ? "\t(*comp*)" : "")
+    				);
+    			}
    		}
+    		
+    		// check that zeros optimization doesn't mess up.
+    		/*
+    		if (true) {
+    			it.reset();
+    			itSet.clear();
+    			while (true) {
+    				String item = it.next();
+    				if (item == null) break;
+    				itSet.add(item);
+    			}
+    			slowIt.setSource(s);
+    			slowItSet.clear();
+    			while (true) {
+    				String item = slowIt.next();
+    				if (item == null) break;
+    				slowItSet.add(item);
+    			}
+    			if (!itSet.equals(slowItSet)) {
+    				errln("Zero optimization failure with " + getReadable(s));
+    			}
+    		}
+    		*/
+    		
+    		mixedCounter++;
    		if (!gotSource || !gotDecomp || !gotComp) {
    			errln("FAIL CanonicalIterator: " + s);
    		}
    	}
    }
    
+    public int TestSpeed() {
+         // skip unless verbose
+        if (!isVerbose()) return 0;
+
+   		String s = "\uAC01\u0345";
+   		
+    	CanonicalIterator it = new CanonicalIterator(s);
+    	double start, end;
+    	int x = 0; // just to keep code from optimizing away.
+    	int iterations = 10000;
+    	double slowDelta = 0;
+    	
+    	/*
+    	CanonicalIterator slowIt = new CanonicalIterator(s);
+    	slowIt.SKIP_ZEROS = false;
+
+    	start = System.currentTimeMillis();
+    	for (int i = 0; i < iterations; ++i) {
+    		slowIt.setSource(s);
+    		while (true) {
+    			String item = slowIt.next();
+    			if (item == null) break;
+    			x += item.length();
+    		}
+    	}
+    	end = System.currentTimeMillis();
+    	double slowDelta = (end-start) / iterations;
+    	logln("Slow iteration: " + slowDelta);
+    	*/
+
+    	start = System.currentTimeMillis();
+    	for (int i = 0; i < iterations; ++i) {
+    		it.setSource(s);
+    		while (true) {
+    			String item = it.next();
+    			if (item == null) break;
+    			x += item.length();
+    		}
+    	}
+    	end = System.currentTimeMillis();
+    	double fastDelta = (end-start) / iterations;
+    	logln("Fast iteration: " + fastDelta + (slowDelta != 0 ? ", " + (fastDelta/slowDelta) : ""));
+    	
+    	
+    	return x;
+    }
+    
    public void TestBasic() {
        // check build
        UnicodeSet ss = CanonicalIterator.getSafeStart();
@ -88,7 +182,9 @@ public class TestCanonicalIterator extends TestFmwk {
        // check permute
        // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
        
-        expectEqual("Simple permutation ", "", collectionToString(new TreeSet(CanonicalIterator.permute("ABC"))), "ABC, ACB, BAC, BCA, CAB, CBA");
+        Set results = new TreeSet();
+        CanonicalIterator.permute("ABC", false, results);
+        expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");
        
        // try samples
        SortedSet set = new TreeSet();
--- a/icu4j/src/com/ibm/icu/text/CanonicalIterator.java
+++ b/icu4j/src/com/ibm/icu/text/CanonicalIterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CanonicalIterator.java,v $ 
- * $Date: 2002/03/14 22:43:03 $ 
- * $Revision: 1.7 $
+ * $Date: 2002/03/19 00:18:44 $ 
+ * $Revision: 1.8 $
 *
 *****************************************************************************************
 */
@ -144,40 +144,50 @@ public class CanonicalIterator {
     * Simple implementation of permutation. 
 	 *<br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     * @param source the string to find permutations for
-     * @return the results in a set.
+     * @param the set to add the results to
     * @internal
     */
-    public static Set permute(String source) {
+    public static void permute(String source, boolean skipZeros, Set output) {
    	// TODO: optimize
        //if (PROGRESS) System.out.println("Permute: " + source);
-        Set result = new TreeSet();
        
        // optimization:
        // if zero or one character, just return a set with it
        // we check for length < 2 to keep from counting code points all the time
        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
-            result.add(source);
-            return result;
+            output.add(source);
+            return;
        }
        
        // otherwise iterate through the string, and recursively permute all the other characters
+        Set subpermute = new HashSet();
        int cp;
        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(source, i);
-            String chStr = UTF16.valueOf(source, i);
+            
+        	// optimization:
+        	// if the character is canonical combining class zero,
+        	// don't permute it
+        	if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
+        		//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
+	        	continue;
+        	}            
            
            // see what the permutations of the characters before and after this one are
-            Set subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
+            subpermute.clear();
+            permute(source.substring(0,i) 
+            	+ source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
            
            // prefix this character to all of them
+            String chStr = UTF16.valueOf(source, i);
            Iterator it = subpermute.iterator();
            while (it.hasNext()) {
                String piece = chStr + (String) it.next();
                //if (PROGRESS) System.out.println("  Piece: " + piece);
-                result.add(piece);
+                output.add(piece);
            }
        }
-        return result;
+        return;
    }
    
    // FOR TESTING
@ -206,6 +216,7 @@ public class CanonicalIterator {
    // debug
    private static boolean PROGRESS = false; // debug progress
    private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
+    private static boolean SKIP_ZEROS = true;
 
    // fields
    private String source;
@ -222,8 +233,9 @@ public class CanonicalIterator {
    
    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    private String[] getEquivalents(String segment) {
-        Set result = new TreeSet();
+        Set result = new HashSet();
        Set basic = getEquivalents2(segment);
+        Set permutations = new HashSet();
        
        // now get all the permutations
        // add only the ones that are canonically equivalent
@ -231,16 +243,24 @@ public class CanonicalIterator {
        Iterator it = basic.iterator();
        while (it.hasNext()) {
            String item = (String) it.next();
-            Set permutations = permute(item);
+        	permutations.clear();
+            permute(item, SKIP_ZEROS, permutations);
            Iterator it2 = permutations.iterator();
            while (it2.hasNext()) {
                String possible = (String) it2.next();
-                String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
-                if (attempt.equals(segment)) {
-                    if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
-                    result.add(possible);
+                
+				String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
+				if (attempt.equals(segment)) {
+					if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
+					result.add(possible);
+
+/*
+                if (isEquivalent(possible, Normalizer.DECOMP)) {
+            		if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
+                	result.add(possible);
+*/
                } else {
-                    if (PROGRESS) System.out.println("-Skipping Permutation: " + NAME.transliterate(possible));
+            		if (PROGRESS) System.out.println("-Skipping Permutation: " + NAME.transliterate(possible));
                }
            }
        }
@ -251,8 +271,16 @@ public class CanonicalIterator {
        return finalResult;
    }
    
+    /**
+     * TODO: Should be method on Normalizer; and MUCH faster
+     */
+     
+    private boolean isEquivalent(String possible, Normalizer.Mode normalizerType) {
+        return possible.equals(Normalizer.normalize(possible, normalizerType, 0));
+	}
+    
    private Set getEquivalents2(String segment) {
-        Set result = new TreeSet();
+        Set result = new HashSet();
        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
        result.add(segment);
        StringBuffer workingBuffer = new StringBuffer();
@ -349,17 +377,19 @@ public class CanonicalIterator {
        return getEquivalents2(remainder);
    }
    
+    /*
    // TODO: fix once we have a codepoint interface to get the canonical combining class
    // TODO: Need public access to canonical combining class in UCharacter!
    private static int getClass(int cp) {
        return Normalizer.getClass((char)cp);
    }
+    */
    
   // ================= BUILDER =========================
    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
    
    private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
-    private static final Set SET_WITH_NULL_STRING = new TreeSet(); // constant, don't change
+    private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
    static {
        SET_WITH_NULL_STRING.add("");
    }
@ -367,7 +397,7 @@ public class CanonicalIterator {
    private static UnicodeSet SAFE_START = new UnicodeSet();
    private static CharMap AT_START = new CharMap();
    
-        // WARNING, NORMALIZER doesn't have supplementaries yet;
+        // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
    private static int LAST_UNICODE = 0xFFFF;
    static {
@ -380,7 +410,7 @@ public class CanonicalIterator {
        if (PROGRESS) System.out.println("Getting Safe Start");
        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
-            int cc = getClass(cp);
+            int cc = UCharacter.getCombiningClass(cp);
            if (cc == 0) SAFE_START.add(cp);
            // will fix to be really safe below
        }
@ -404,7 +434,7 @@ public class CanonicalIterator {
                component = UTF16.charAt(decomp, i);
                if (i == 0) {
                    AT_START.add(component, cp);
-                } else if (getClass(component) == 0) {
+                } else if (UCharacter.getCombiningClass(component) == 0) {
                    SAFE_START.remove(component);
                }
            }