From ad9daf070c3af2d974e4a8f24a6ce93e8143bfe2 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Thu, 8 Aug 2002 15:35:01 +0000
Subject: [PATCH] quick and dirty collation test

X-SVN-Rev: 9615
---
 .../com/ibm/text/UCD/CheckCollator.java       | 351 ++++++++++++++++++
 tools/unicodetools/com/ibm/text/UCD/Main.java |   5 +-
 2 files changed, 354 insertions(+), 2 deletions(-)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/CheckCollator.java

diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java
new file mode 100644
index 00000000000..521364791ee
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java
@@ -0,0 +1,351 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
+* $Date: 2002/08/08 15:35:01 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+// http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html
+
+package com.ibm.text.UCD;
+
+import java.util.*;
+import java.io.*;
+import java.text.NumberFormat;
+
+import com.ibm.text.utility.*;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Quick and dirty benchmark that gives a rough idea of collation performance,
+ * comparing the JDK collator (java.text.Collator, "old") with the ICU collator
+ * (com.ibm.icu.text.Collator, "new"). For each locale it prints sort key size,
+ * sort key creation time, optionally raw compare time, binary search time and
+ * sort time; each timing is reduced by a separately measured loop overhead.
+ */
+abstract public class CheckCollator {
+    // Test data files are named PREFIX + name + ".txt".
+    static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_";
+    // When true, test() also times all-pairs raw compares (size * size calls per collator).
+    static final boolean DO_RAW = false;
+
+    static final NumberFormat nf = NumberFormat.getInstance();
+    static final NumberFormat percent = NumberFormat.getPercentInstance();
+    static {
+        nf.setMaximumFractionDigits(2);
+    }
+
+    /**
+     * Runs the benchmark for a fixed set of locale / data file pairs.
+     *
+     * @param args ignored (may be null); later, drive off of args
+     * @throws IOException if a test data file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
+        test(Locale.KOREAN, "Korean");
+        test(Locale.ENGLISH, "Latin");
+        test(Locale.FRENCH, "Latin");
+        test(Locale.JAPANESE, "Japanese");
+    }
+
+    /**
+     * Loads the test strings from PREFIX + name + ".txt" and prints size and timing
+     * comparisons between the JDK ("old") and ICU ("new") collators for loc.
+     * Timings come from System.currentTimeMillis() and are printed in microseconds
+     * per iteration, so they are only rough indications.
+     *
+     * @param loc locale used to obtain both collators
+     * @param name suffix of the data file name, e.g. "Latin"
+     * @throws IOException if the data file cannot be opened or read
+     */
+    public static void test(Locale loc, String name) throws IOException {
+
+        System.out.println();
+        System.out.println("Testing " + loc.getDisplayName() + ", file: " + name);
+        System.out.println();
+
+        // get test data
+
+        String fileName = PREFIX + name + ".txt";
+        int counter = 0;
+        ArrayList list = new ArrayList();
+
+        FileInputStream fis = new FileInputStream(fileName);
+        try {
+            InputStreamReader isr = new InputStreamReader(fis, "UnicodeLittle");
+            BufferedReader br = new BufferedReader(isr, 32*1024);
+            while (true) {
+                String line = Utility.readDataLine(br);
+                if (line == null) break;
+                if (line.length() == 0) continue;
+                Utility.dot(counter++);
+                list.add(line);
+            }
+        } finally {
+            fis.close(); // release the file handle even if reading fails
+        }
+        System.out.println("Read " + counter + " lines in file");
+
+        if (list.isEmpty()) {
+            // nothing to measure; also, doubling an empty list below would never terminate
+            System.out.println("No test data in " + fileName + ", skipping");
+            return;
+        }
+
+        int limit = 800; // put a limit on it to save time
+
+        // pump it up if there aren't very many
+        while (list.size() < limit) {
+            list.addAll(list);
+        }
+
+        // later, adjust these so we always get a reasonable number of tries
+
+        int extraIterations = 200;
+        int size = list.size();
+        if (size > limit) size = limit;
+
+        String[] tests = new String [size];
+
+        for (int i = 0; i < size; ++i) {
+            tests[i] = (String) list.get(i);
+        }
+
+        // get collators
+        com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc);
+        java.text.Collator oldCol = java.text.Collator.getInstance(loc);
+
+        double startTime, endTime, delta, oldDelta;
+        String probe;
+
+        // load classes at least once before starting
+        newCol.compare("a", "b");
+        oldCol.compare("a", "b");
+
+        // ================================================
+        // check sort key size
+
+        int stringSize = 0, newSize = 0, oldSize = 0;
+
+        for (int i = 0; i < size; ++i) {
+            stringSize += tests[i].length() * 2; // 2 bytes per UTF-16 code unit
+            byte[] newKey = newCol.getCollationKey(tests[i]).toByteArray();
+            newSize += newKey.length;
+            byte[] oldKey = oldCol.getCollationKey(tests[i]).toByteArray();
+            oldSize += oldKey.length;
+        }
+        delta = stringSize/(size + 0.0);
+        System.out.println("string size: " + nf.format(delta) + " bytes per key");
+        System.out.println();
+
+        delta = oldDelta = (oldSize/(size + 0.0));
+        System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key ");
+        delta = (newSize/(size + 0.0));
+        System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Sort Key: old time
+
+        // get overhead time
+        counter = 0;
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            for (int j = 0; j < size; ++j) {
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        double overhead = (1000*(endTime - startTime) / counter);
+        // print the microsecond value that is actually subtracted below (was millis labelled micros)
+        System.out.println("overhead: " + nf.format(overhead) + " micros");
+
+        counter = 0;
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                oldCol.getCollationKey(probe);
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
+        System.out.println("Old sort key time: " + nf.format(delta)
+            + " micros (" + counter + " iterations)");
+
+        // Sort Key: new time
+
+        counter = 0;
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                newCol.getCollationKey(probe);
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / counter) - overhead;
+        System.out.println("New sort key time: " + nf.format(delta)
+            + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Raw Compare
+
+        if (DO_RAW) {
+            // get overhead time
+            counter = 0;
+            startTime = System.currentTimeMillis();
+            int opt = 0; // to keep the compiler from optimizing out
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= probe.compareTo(tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            overhead = (1000*(endTime - startTime) / counter);
+            System.out.println("overhead: " + nf.format(overhead) + " micros");
+
+            // Raw Compare: old time
+            counter = 0;
+            startTime = System.currentTimeMillis();
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= oldCol.compare(probe, tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
+            System.out.println("Old raw compare time: " + nf.format(delta)
+                + " micros (" + counter + " iterations)");
+
+            // Raw Compare: new time
+            counter = 0;
+            startTime = System.currentTimeMillis();
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= newCol.compare(probe, tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            delta = (1000*(endTime - startTime) / counter) - overhead;
+            System.out.println("New raw compare time: " + nf.format(delta)
+                + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
+            System.out.println();
+        }
+
+        // ================================================
+        // Binary Search
+        // note: I don't worry about getting the binary search precisely right, since I just want to
+        // see which strings would get compared.
+        // Each array sort is done before its timer starts so that all three timings
+        // (overhead, old, new) cover the same work: the searches only.
+
+        // overhead
+        int iterations = (size * extraIterations);
+        int opt2 = 0; // keep from optimizing out
+        Arrays.sort(tests);
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        overhead = delta = (1000*(endTime - startTime) / iterations);
+        System.out.println("Overhead: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // old time
+        Arrays.sort(tests, oldCol);
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe, oldCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("Old binary search time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // new time
+        Arrays.sort(tests, newCol);
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe, newCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("New binary search time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Sort
+
+        String[] sortTests = (String[]) tests.clone();
+        extraIterations = 5;
+        iterations = (size * extraIterations);
+
+        // overhead
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        overhead = delta = (1000*(endTime - startTime) / iterations);
+        System.out.println("overhead: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // old time
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests, oldCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("Old sort time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // new time
+        startTime = System.currentTimeMillis();
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests, newCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("New sort time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
+    }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java
index 68b630d8457..d715640e12a 100644
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/08/04 21:38:45 $
-* $Revision: 1.20 $
+* $Date: 2002/08/08 15:35:01 $
+* $Revision: 1.21 $
 *
 *******************************************************************************
 */
@@ -79,6 +79,7 @@ public final class Main implements UCD_Types {
             
             
             else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
+            else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);
 
             else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
             else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();