From ad9daf070c3af2d974e4a8f24a6ce93e8143bfe2 Mon Sep 17 00:00:00 2001
From: Mark Davis
Date: Thu, 8 Aug 2002 15:35:01 +0000
Subject: [PATCH] quick and dirty collation test

X-SVN-Rev: 9615
---
 .../com/ibm/text/UCD/CheckCollator.java       | 376 ++++++++++++++++++
 tools/unicodetools/com/ibm/text/UCD/Main.java |   5 +-
 2 files changed, 379 insertions(+), 2 deletions(-)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/CheckCollator.java

diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java
new file mode 100644
index 00000000000..521364791ee
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/CheckCollator.java
@@ -0,0 +1,376 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
+* $Date: 2002/08/08 15:35:01 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+// http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html
+
+package com.ibm.text.UCD;
+
+import java.util.*;
+import java.io.*;
+import java.text.NumberFormat;
+
+import com.ibm.text.utility.*;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * This is a quick and dirty program to get some idea of collation performance, comparing old Java to new stuff.
+ */
+abstract public class CheckCollator {
+    /** Path prefix of the test data files; test() appends the data set name and ".txt". */
+    static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_";
+    /** When true, test() also runs the raw compare timings. */
+    static final boolean DO_RAW = false;
+
+    static final NumberFormat nf = NumberFormat.getInstance();
+    static final NumberFormat percent = NumberFormat.getPercentInstance();
+    static {
+        nf.setMaximumFractionDigits(2);
+    }
+
+    /**
+     * Runs test() for a fixed list of locale / data set pairs.
+     *
+     * @param args not used yet
+     * @throws IOException if a test data file cannot be opened or read
+     */
+    public static void main(String[] args) throws IOException {
+
+        // later, drive off of args
+
+        // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
+        test(Locale.KOREAN, "Korean");
+        test(Locale.ENGLISH, "Latin");
+        test(Locale.FRENCH, "Latin");
+        test(Locale.JAPANESE, "Japanese");
+    }
+
+    /**
+     * Compares the old (java.text) and new (com.ibm.icu.text) collators for one locale on one
+     * data file and prints the results: sort key size, sort key time, raw compare time
+     * (only if DO_RAW), binary search time and sort time. Each "new" figure is followed by
+     * its ratio to the matching "old" figure.
+     *
+     * @param loc locale both collators are created for
+     * @param name data set name; the file read is PREFIX + name + ".txt"
+     * @throws IOException if the data file cannot be opened or read
+     */
+    public static void test(Locale loc, String name) throws IOException {
+
+        System.out.println();
+        System.out.println("Testing " + loc.getDisplayName() + ", file: " + name);
+        System.out.println();
+
+        // get test data
+
+        String fileName = PREFIX + name + ".txt";
+
+        int counter = 0;
+        ArrayList list = new ArrayList();
+
+        FileInputStream fis = new FileInputStream(fileName);
+        try {
+            InputStreamReader isr = new InputStreamReader(fis, "UnicodeLittle");
+            BufferedReader br = new BufferedReader(isr, 32*1024);
+
+            while (true) {
+                String line = Utility.readDataLine(br);
+                if (line == null) break;
+                if (line.length() == 0) continue;
+                Utility.dot(counter++);
+                list.add(line);
+            }
+        } finally {
+            fis.close(); // release the file even if reading fails
+        }
+        System.out.println("Read " + counter + " lines in file");
+
+        if (list.isEmpty()) {
+            // nothing to time, and the doubling loop below would never terminate
+            System.out.println("No test data in " + fileName);
+            return;
+        }
+
+        int limit = 800; // put a limit on it to save time
+
+        // pump it up if there aren't very many
+        while (list.size() < limit) {
+            list.addAll(list);
+        }
+
+        int size = list.size();
+
+        // later, adjust these so we always get a reasonable number of tries
+
+        int extraIterations = 200;
+        if (size > limit) size = limit;
+
+        String[] tests = new String [size];
+
+        for (int i = 0; i < size; ++i) {
+            tests[i] = (String) list.get(i);
+        }
+
+        // get collators
+
+        com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc);
+        java.text.Collator oldCol = java.text.Collator.getInstance(loc);
+
+        double startTime, endTime;
+        double delta, oldDelta;
+        String probe;
+
+        // load classes at least once before starting
+
+        newCol.compare("a", "b");
+        oldCol.compare("a", "b");
+
+        // ================================================
+        // check sort key size
+
+        int stringSize = 0, newSize = 0, oldSize = 0;
+
+        for (int i = 0; i < size; ++i) {
+            stringSize += tests[i].length() * 2;
+            byte[] newKey = newCol.getCollationKey(tests[i]).toByteArray();
+            newSize += newKey.length;
+            byte[] oldKey = oldCol.getCollationKey(tests[i]).toByteArray();
+            oldSize += oldKey.length;
+        }
+        delta = stringSize/(size + 0.0);
+        System.out.println("string size: " + nf.format(delta) + " bytes per key");
+        System.out.println();
+
+        delta = oldDelta = (oldSize/(size + 0.0));
+        System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key ");
+        delta = (newSize/(size + 0.0));
+        System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Sort Key: old time
+
+        // get overhead time
+        counter = 0;
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            for (int j = 0; j < size; ++j) {
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        // 1000 * millis / iterations = micros per iteration
+        double overhead = (1000*(endTime - startTime) / counter);
+        System.out.println("overhead: " + nf.format(overhead) + " micros");
+
+        counter = 0;
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                oldCol.getCollationKey(probe);
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
+        System.out.println("Old sort key time: " + nf.format(delta)
+            + " micros (" + counter + " iterations)");
+
+        // Sort Key: new time
+
+        counter = 0;
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                newCol.getCollationKey(probe);
+                counter++;
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / counter) - overhead;
+        System.out.println("New sort key time: " + nf.format(delta)
+            + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Raw Compare
+
+        if (DO_RAW) {
+            // get overhead time
+            counter = 0;
+            startTime = System.currentTimeMillis();
+            int opt = 0; // to keep the compiler from optimizing out
+
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= probe.compareTo(tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            overhead = (1000*(endTime - startTime) / counter);
+            System.out.println("overhead: " + nf.format(overhead) + " micros");
+
+            // Raw Compare: old time
+
+            counter = 0;
+            startTime = System.currentTimeMillis();
+
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= oldCol.compare(probe, tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead;
+            System.out.println("Old raw compare time: " + nf.format(delta)
+                + " micros (" + counter + " iterations)");
+
+            // Raw Compare: new time
+
+            counter = 0;
+            startTime = System.currentTimeMillis();
+
+            for (int i = 0; i < size; ++i) {
+                probe = tests[i];
+                for (int j = 0; j < size; ++j) {
+                    opt ^= newCol.compare(probe, tests[j]);
+                    counter++;
+                }
+            }
+            endTime = System.currentTimeMillis();
+            delta = (1000*(endTime - startTime) / counter) - overhead;
+            System.out.println("New raw compare time: " + nf.format(delta)
+                + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta));
+            System.out.println();
+        }
+
+        // ================================================
+        // Binary Search
+        // note: I don't worry about getting the binary search precisely right, since I just want to
+        // see which strings would get compared.
+        // In all three passes the array is sorted before the clock starts, so only the searches are timed.
+
+        // overhead
+
+        int iterations = (size * extraIterations);
+        Arrays.sort(tests);
+        startTime = System.currentTimeMillis();
+        int opt2 = 0; // keep from optimizing out
+
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        overhead = delta = (1000*(endTime - startTime) / iterations);
+        System.out.println("Overhead: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // old time
+
+        Arrays.sort(tests, oldCol);
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe, oldCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("Old binary search time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // new time
+
+        Arrays.sort(tests, newCol);
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            probe = tests[i];
+            for (int k = 0; k < extraIterations; ++k) {
+                opt2 ^= Arrays.binarySearch(tests, probe, newCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("New binary search time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
+        System.out.println();
+
+        // ================================================
+        // Sort
+
+        String[] sortTests = (String[]) tests.clone();
+        extraIterations = 5;
+        iterations = (size * extraIterations);
+
+        // overhead
+
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        overhead = delta = (1000*(endTime - startTime) / iterations);
+        System.out.println("overhead: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // old time
+
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests, oldCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("Old sort time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations)");
+
+        // new time
+
+        startTime = System.currentTimeMillis();
+
+        for (int i = 0; i < size; ++i) {
+            for (int k = 0; k < extraIterations; ++k) {
+                System.arraycopy(tests, 0, sortTests, 0, tests.length); // copy array
+                Arrays.sort(sortTests, newCol);
+            }
+        }
+        endTime = System.currentTimeMillis();
+        delta = (1000*(endTime - startTime) / iterations) - overhead;
+        System.out.println("New sort time: " + nf.format(delta)
+            + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta));
+
+    }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java
index 68b630d8457..d715640e12a 100644
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/08/04 21:38:45 $
-* $Revision: 1.20 $
+* $Date: 2002/08/08 15:35:01 $
+* $Revision: 1.21 $
 *
 *******************************************************************************
 */
@@ -79,6 +79,7 @@ public final class Main implements UCD_Types {
 
 
                 else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
+                else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);
                 else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
                 else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
 