ICU-1682 fixed exhaustive test.

Added 20% optimization (doesn't try to permute characters with canonical combining class zero)

X-SVN-Rev: 8100
This commit is contained in:
Mark Davis 2002-03-19 00:18:44 +00:00
parent d55cb3d234
commit 5394fe42d1
2 changed files with 155 additions and 29 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java,v $
* $Date: 2002/03/14 22:43:03 $
* $Revision: 1.6 $
* $Date: 2002/03/19 00:18:44 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
@ -41,7 +41,18 @@ public class TestCanonicalIterator extends TestFmwk {
public void TestExhaustive() {
int counter = 0;
int mixedCounter = 0;
int lastMixedCounter = -1;
CanonicalIterator it = new CanonicalIterator("");
/*
CanonicalIterator slowIt = new CanonicalIterator("");
slowIt.SKIP_ZEROS = false;
*/
Transliterator name = Transliterator.getInstance("[^\\u0020-\\u007F] name");
Set itSet = new TreeSet();
Set slowItSet = new TreeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
// skip characters we know don't have decomps
@ -65,16 +76,99 @@ public class TestCanonicalIterator extends TestFmwk {
while (true) {
String item = it.next();
if (item == null) break;
if (!item.equals(s)) gotSource = true;
if (!item.equals(decomp)) gotDecomp = true;
if (!item.equals(comp)) gotComp = true;
if (item.equals(s)) gotSource = true;
if (item.equals(decomp)) gotDecomp = true;
if (item.equals(comp)) gotComp = true;
if ((mixedCounter & 0x7F) == 0 && (i < 0xAD00 || i > 0xAC00 + 11172)) {
if (lastMixedCounter != mixedCounter) {
logln("");
lastMixedCounter = mixedCounter;
}
logln("\t" + mixedCounter + "\t" + name.transliterate(item)
+ (item.equals(s) ? "\t(*original*)" : "")
+ (item.equals(decomp) ? "\t(*decomp*)" : "")
+ (item.equals(comp) ? "\t(*comp*)" : "")
);
}
}
// check that zeros optimization doesn't mess up.
/*
if (true) {
it.reset();
itSet.clear();
while (true) {
String item = it.next();
if (item == null) break;
itSet.add(item);
}
slowIt.setSource(s);
slowItSet.clear();
while (true) {
String item = slowIt.next();
if (item == null) break;
slowItSet.add(item);
}
if (!itSet.equals(slowItSet)) {
errln("Zero optimization failure with " + getReadable(s));
}
}
*/
mixedCounter++;
if (!gotSource || !gotDecomp || !gotComp) {
errln("FAIL CanonicalIterator: " + s);
}
}
}
/**
 * Rough timing check for CanonicalIterator: repeatedly resets one iterator to a
 * fixed sample string, drains it, and logs the average elapsed milliseconds per pass.
 * Does nothing (and returns 0) unless the test is running in verbose mode.
 *
 * @return the summed length of every string the iterator produced; returned only
 *         so the iteration work cannot be optimized away
 */
public int TestSpeed() {
    // skip unless verbose
    if (!isVerbose()) {
        return 0;
    }

    final String sample = "\uAC01\u0345";
    final CanonicalIterator fastIt = new CanonicalIterator(sample);
    final int passes = 10000;
    int lengthSum = 0; // just to keep code from optimizing away.

    // Stays 0 while the baseline measurement below is disabled; in that case
    // no ratio is appended to the log line.
    double slowDelta = 0;
    /*
    Disabled baseline: the same loop with the class-zero optimization switched off.
    NOTE(review): SKIP_ZEROS appears to be private in CanonicalIterator, so access
    would have to be widened before this can be re-enabled - confirm.

    CanonicalIterator slowIt = new CanonicalIterator(sample);
    slowIt.SKIP_ZEROS = false;
    final double slowBegin = System.currentTimeMillis();
    for (int pass = 0; pass < passes; ++pass) {
        slowIt.setSource(sample);
        for (String item = slowIt.next(); item != null; item = slowIt.next()) {
            lengthSum += item.length();
        }
    }
    slowDelta = (System.currentTimeMillis() - slowBegin) / passes;
    logln("Slow iteration: " + slowDelta);
    */

    // The clock value is held as a double so the average below is a
    // floating-point division rather than an integer one.
    final double begin = System.currentTimeMillis();
    for (int pass = 0; pass < passes; ++pass) {
        fastIt.setSource(sample);
        for (String item = fastIt.next(); item != null; item = fastIt.next()) {
            lengthSum += item.length();
        }
    }
    final double elapsed = System.currentTimeMillis() - begin;
    final double fastDelta = elapsed / passes;

    String ratio = "";
    if (slowDelta != 0) {
        ratio = ", " + (fastDelta / slowDelta);
    }
    logln("Fast iteration: " + fastDelta + ratio);
    return lengthSum;
}
public void TestBasic() {
// check build
UnicodeSet ss = CanonicalIterator.getSafeStart();
@ -88,7 +182,9 @@ public class TestCanonicalIterator extends TestFmwk {
// check permute
// NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
expectEqual("Simple permutation ", "", collectionToString(new TreeSet(CanonicalIterator.permute("ABC"))), "ABC, ACB, BAC, BCA, CAB, CBA");
Set results = new TreeSet();
CanonicalIterator.permute("ABC", false, results);
expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");
// try samples
SortedSet set = new TreeSet();

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CanonicalIterator.java,v $
* $Date: 2002/03/14 22:43:03 $
* $Revision: 1.7 $
* $Date: 2002/03/19 00:18:44 $
* $Revision: 1.8 $
*
*****************************************************************************************
*/
@ -144,40 +144,50 @@ public class CanonicalIterator {
* Simple implementation of permutation.
*<br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
* @param source the string to find permutations for
* @return the results in a set.
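* @param skipZeros if true, a character with canonical combining class zero is never moved to the front of a permutation (it leads only when it is already first)
* @param output the set to add the results to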
* @internal
*/
public static Set permute(String source) {
public static void permute(String source, boolean skipZeros, Set output) {
// TODO: optimize
//if (PROGRESS) System.out.println("Permute: " + source);
Set result = new TreeSet();
// optimization:
// if zero or one character, the only permutation is the string itself
// we check for length <= 2 first (one code point is at most two UTF-16 code units)
// to keep from counting code points all the time
if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
result.add(source);
return result;
output.add(source);
return;
}
// otherwise iterate through the string, and recursively permute all the other characters
// scratch set for the permutations of the remaining characters
Set subpermute = new HashSet();
// cp is assigned at the top of each loop body, before the increment expression reads it
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
String chStr = UTF16.valueOf(source, i);
// optimization:
// if the character is canonical combining class zero,
// don't permute it
// (a class-zero character at index 0 is still used as the leading character)
if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
continue;
}
// see what the permutations of the characters before and after this one are
Set subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
// the scratch set is reused on every pass, so empty it before refilling
subpermute.clear();
permute(source.substring(0,i)
+ source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
// prefix this character to all of them
String chStr = UTF16.valueOf(source, i);
Iterator it = subpermute.iterator();
while (it.hasNext()) {
String piece = chStr + (String) it.next();
//if (PROGRESS) System.out.println(" Piece: " + piece);
result.add(piece);
output.add(piece);
}
}
return result;
return;
}
// FOR TESTING
@ -206,6 +216,7 @@ public class CanonicalIterator {
// debug
private static boolean PROGRESS = false; // debug progress
private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
private static boolean SKIP_ZEROS = true;
// fields
private String source;
@ -222,8 +233,9 @@ public class CanonicalIterator {
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
private String[] getEquivalents(String segment) {
Set result = new TreeSet();
Set result = new HashSet();
Set basic = getEquivalents2(segment);
Set permutations = new HashSet();
// now get all the permutations
// add only the ones that are canonically equivalent
@ -231,16 +243,24 @@ public class CanonicalIterator {
Iterator it = basic.iterator();
while (it.hasNext()) {
String item = (String) it.next();
Set permutations = permute(item);
permutations.clear();
permute(item, SKIP_ZEROS, permutations);
Iterator it2 = permutations.iterator();
while (it2.hasNext()) {
String possible = (String) it2.next();
String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
if (attempt.equals(segment)) {
if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
result.add(possible);
String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
if (attempt.equals(segment)) {
if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
result.add(possible);
/*
if (isEquivalent(possible, Normalizer.DECOMP)) {
if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
result.add(possible);
*/
} else {
if (PROGRESS) System.out.println("-Skipping Permutation: " + NAME.transliterate(possible));
if (PROGRESS) System.out.println("-Skipping Permutation: " + NAME.transliterate(possible));
}
}
}
@ -251,8 +271,16 @@ public class CanonicalIterator {
return finalResult;
}
/**
 * Reports whether a string is left unchanged by normalization, i.e. whether it
 * equals the result of normalizing it to the requested form.
 * TODO: Should be method on Normalizer; and MUCH faster
 *
 * @param possible the candidate string to check
 * @param normalizerType the normalization form to compare against
 * @return true if normalizing {@code possible} yields an equal string
 */
private boolean isEquivalent(String possible, Normalizer.Mode normalizerType) {
    String normalized = Normalizer.normalize(possible, normalizerType, 0);
    return possible.equals(normalized);
}
private Set getEquivalents2(String segment) {
Set result = new TreeSet();
Set result = new HashSet();
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
@ -349,17 +377,19 @@ public class CanonicalIterator {
return getEquivalents2(remainder);
}
/*
// TODO: fix once we have a codepoint interface to get the canonical combining class
// TODO: Need public access to canonical combining class in UCharacter!
private static int getClass(int cp) {
return Normalizer.getClass((char)cp);
}
*/
// ================= BUILDER =========================
// TODO: Flatten this data so it doesn't have to be reconstructed each time!
private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
private static final Set SET_WITH_NULL_STRING = new TreeSet(); // constant, don't change
private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
static {
SET_WITH_NULL_STRING.add("");
}
@ -367,7 +397,7 @@ public class CanonicalIterator {
private static UnicodeSet SAFE_START = new UnicodeSet();
private static CharMap AT_START = new CharMap();
// WARNING, NORMALIZER doesn't have supplementaries yet;
// TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
// Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
private static int LAST_UNICODE = 0xFFFF;
static {
@ -380,7 +410,7 @@ public class CanonicalIterator {
if (PROGRESS) System.out.println("Getting Safe Start");
for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
int cc = getClass(cp);
int cc = UCharacter.getCombiningClass(cp);
if (cc == 0) SAFE_START.add(cp);
// will fix to be really safe below
}
@ -404,7 +434,7 @@ public class CanonicalIterator {
component = UTF16.charAt(decomp, i);
if (i == 0) {
AT_START.add(component, cp);
} else if (getClass(component) == 0) {
} else if (UCharacter.getCombiningClass(component) == 0) {
SAFE_START.remove(component);
}
}