diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index d14c3046534..74f6639aaf6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -9,6 +9,7 @@ package com.ibm.icu.text; import java.io.IOException; import java.text.ParsePosition; import java.util.Collection; +import java.util.Comparator; import java.util.Iterator; import java.util.MissingResourceException; import java.util.TreeSet; @@ -260,7 +261,7 @@ import com.ibm.icu.util.VersionInfo; * @stable ICU 2.0 * @see UnicodeSetIterator */ -public class UnicodeSet extends UnicodeFilter implements Freezable { +public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable { private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. @@ -1045,6 +1046,18 @@ public class UnicodeSet extends UnicodeFilter implements Freezable { return add_unchecked(start, end); } + /** + * Adds all characters in range (uses preferred naming convention). + * @param start first character of the range + * @param end last character of the range + * @return the result of the internal range add (presumably this set, for chaining -- TODO confirm) + * @draft ICU 4.2 + */ + public UnicodeSet addAll(int start, int end) { + checkFrozen(); + return add_unchecked(start, end); + } + // for internal use, after checkFrozen has been called private UnicodeSet add_unchecked(int start, int end) { if (start < MIN_VALUE || start > MAX_VALUE) { @@ -2666,25 +2679,35 @@ public class UnicodeSet extends UnicodeFilter implements Freezable { /** * Add the contents of the UnicodeSet (as strings) into a collection. 
* @param target collection to add into + * @return the target collection * @stable ICU 2.8 */ - public void addAllTo(Collection target) { - UnicodeSetIterator it = new UnicodeSetIterator(this); - while (it.next()) { - target.add(it.getString()); - } + public <U extends Collection<String>> U addAllTo(U target) { + return addAllTo(this, target); } /** * Add the contents of the collection (as strings) into this UnicodeSet. * @param source the collection to add + * @return this object, for chaining * @stable ICU 2.8 */ - public void addAll(Collection source) { + public UnicodeSet add(Collection source) { + return addAll(source); + } + + /** + * Add the contents of the collection (as strings) into this UnicodeSet. Uses standard naming convention. + * @param source the collection to add; each element is added via its toString() + * @return this object, for chaining + * @draft ICU 4.2 + */ + public UnicodeSet addAll(Collection source) { checkFrozen(); for (Object o : source) { add(o.toString()); } + return this; } //---------------------------------------------------------------- @@ -3742,7 +3765,7 @@ public class UnicodeSet extends UnicodeFilter implements Freezable { * @return this * @stable ICU 3.8 */ - public Object freeze() { + public UnicodeSet freeze() { frozen = true; return this; } @@ -3764,5 +3787,292 @@ public class UnicodeSet extends UnicodeFilter implements Freezable { throw new UnsupportedOperationException("Attempt to modify frozen object"); } } + + // ************************ + // Additional methods for integration with Generics and Collections + // ************************ + + /** + * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. + * @see java.util.Set#iterator() + * @draft ICU 4.2 + */ + public Iterator<String> iterator() { + return new UnicodeSetIterator2(this); + } + + // Cover for string iteration. 
+ private static class UnicodeSetIterator2 implements Iterator<String> { + // Invariants: + // sourceList != null then sourceList[item] is a valid character + // sourceList == null then delegates to stringIterator + private int[] sourceList; + private int len; + private int item; + private int current; + private int limit; + private TreeSet<String> sourceStrings; + private Iterator<String> stringIterator; + private char[] buffer; + + UnicodeSetIterator2(UnicodeSet source) { + // set according to invariants + len = source.len - 1; + if (item >= len) { + stringIterator = source.strings.iterator(); + sourceList = null; + } else { + sourceStrings = source.strings; + sourceList = source.list; + current = sourceList[item++]; + limit = sourceList[item++]; + } + } + + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + return sourceList != null || stringIterator.hasNext(); + } + + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public String next() { + if (sourceList == null) { + return stringIterator.next(); + } + int codepoint = current++; + // we have the codepoint we need, but we may need to adjust the state (advance to the next range, or switch to the strings once the ranges are exhausted) + if (current >= limit) { + if (item >= len) { + stringIterator = sourceStrings.iterator(); + sourceList = null; + } else { + current = sourceList[item++]; + limit = sourceList[item++]; + } + } + // Now return. Single code point is easy + if (codepoint <= 0xFFFF) { + return String.valueOf((char)codepoint); + } + // But Java lacks a valueOfCodePoint, so we handle ourselves for speed + // allocate a buffer the first time, to make conversion faster. 
+ if (buffer == null) { + buffer = new char[2]; + } + // compute ourselves, to save tests and calls; a UTF-16 pair is the lead (high) surrogate first, then the trail (low) surrogate + int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; + buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); + buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); + return String.valueOf(buffer); + } + + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * @see #containsAll(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public boolean containsAll(Collection<String> collection) { + for (String o : collection) { + if (!contains(o)) { + return false; + } + } + return true; + } + + /** + * @see #containsNone(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public boolean containsNone(Collection<String> collection) { + for (String o : collection) { + if (contains(o)) { + return false; + } + } + return true; + } + + /** + * @see #containsAll(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public final boolean containsSome(Collection<String> collection) { + return !containsNone(collection); + } + + /** + * @see #addAll(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public UnicodeSet addAll(String... 
collection) { + checkFrozen(); + for (String str : collection) { + add(str); + } + return this; + } + + + /** + * @see #removeAll(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public UnicodeSet removeAll(Collection<String> collection) { + checkFrozen(); + for (String o : collection) { + remove(o); + } + return this; + } + + /** + * @see #retainAll(com.ibm.icu.text.UnicodeSet) + * @draft ICU 4.2 + */ + public UnicodeSet retainAll(Collection<String> collection) { + checkFrozen(); + // TODO optimize + UnicodeSet toRetain = new UnicodeSet(); + toRetain.addAll(collection); + retainAll(toRetain); + return this; + } + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + * @draft ICU 4.2 + */ + public int compareTo(UnicodeSet o) { + int result; + for (int i = 0; ; ++i) { + if (0 != (result = list[i] - o.list[i])) { + // if either list ran out, compare to the first string + if (list[i] == HIGH) { + if (strings.isEmpty()) return 1; + String item = strings.first(); + return compare(item, o.list[i]); + } + if (o.list[i] == HIGH) { + if (o.strings.isEmpty()) return -1; + String item = o.strings.first(); + return -compare(item, list[i]); + } + // otherwise return the result if even index, or the reversal if not + return (i & 1) == 0 ? result : -result; + } + if (list[i] == HIGH) { + break; + } + } + return compare(strings, o.strings); + } + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + * @draft ICU 4.2 + */ + public int compareTo(Iterable<String> other) { + return compare(this, other); + } + + /** + * Utility to compare a string to a code point. + * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) + * and comparing, but much faster (no object creation). + * Note that this (=String) order is UTF-16 order -- *not* code point order. 
+ * @draft ICU 4.2 + */ + public static int compare(String string, int codePoint) { + if (codePoint < Character.MIN_CODE_POINT || codePoint > Character.MAX_CODE_POINT) { + throw new IllegalArgumentException(); + } + int stringLength = string.length(); + if (stringLength == 0) { + return -1; + } + char firstChar = string.charAt(0); + int offset = codePoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; + + if (offset < 0) { // BMP codePoint + int result = firstChar - codePoint; + if (result != 0) { + return result; + } + return stringLength - 1; + } + // non BMP + char lead = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); + int result = firstChar - lead; + if (result != 0) { + return result; + } + if (stringLength > 1) { + char trail = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); + result = string.charAt(1) - trail; + if (result != 0) { + return result; + } + } + return stringLength - 2; + } + + /** + * Utility to compare a code point to a string; returns the negation of compare(String, int) with the arguments swapped. + * Same results as turning the code point into a string and comparing, but much faster (no object creation). + * NOTE(review): this comment used to say that a null compares as less, but the code shown dereferences the string without a null check -- confirm the intended null handling. + * @draft ICU 4.2 + */ + public static int compare(int codepoint, String a) { + return -compare(a, codepoint); + } + + /** + * Utility to compare two collections of iterables. Warning: the ordering in iterables is important. For Collections that are ordered, + * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. + * That means that sets can't be compared directly with this method, unless they are TreeSets without + * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of + * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 
+ * @draft ICU 4.2 + */ + public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { + Iterator<T> first = collection1.iterator(); + Iterator<T> other = collection2.iterator(); + while (true) { + if (!first.hasNext()) { + return other.hasNext() ? -1 : 0; + } else if (!other.hasNext()) { + return 1; + } + T item1 = first.next(); + T item2 = other.next(); + int result = item1.compareTo(item2); + if (result != 0) { + return result; + } + } + } + + /** + * Utility for adding the contents of an iterable to a collection; returns the target collection. + * @draft ICU 4.2 + */ + public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { + for (T item : source) { + target.add(item); + } + return target; + } } //eof diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java index af829ff83c3..ac96e17729d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java @@ -13,6 +13,7 @@ import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.SortedSet; @@ -36,280 +37,280 @@ import com.ibm.icu.text.UnicodeSetIterator; * @summary General test of UnicodeSet */ public class UnicodeSetTest extends TestFmwk { - - static final String NOT = "%%%%"; - - public static void main(String[] args) throws Exception { - new UnicodeSetTest().run(args); - } - private static final boolean isCccValue(int ccc) { - switch (ccc) { - case 0: - case 1: - case 7: - case 8: - case 9: - case 200: - case 202: - case 216: - case 218: - case 220: - case 222: - case 224: - case 226: - case 228: - case 230: - case 232: - case 233: - case 234: - case 240: - return true; - default: - return false; + static final String NOT = "%%%%"; + + public static void main(String[] args) throws 
Exception { + new UnicodeSetTest().run(args); } - } - public void TestPropertyAccess() { - int count = 0; - // test to see that all of the names work - for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) { - count++; - //Skipping tests in the non-exhaustive mode to shorten the test time ticket#6475 - if(getInclusion()<=5 && count%5!=0){ - continue; - } - if (propNum >= UProperty.BINARY_LIMIT && propNum < UProperty.INT_START) { // skip the gap - propNum = UProperty.INT_START; - } - for (int nameChoice = UProperty.NameChoice.SHORT; nameChoice <= UProperty.NameChoice.LONG; ++nameChoice) { - String propName; - try { - propName = UCharacter.getPropertyName(propNum, nameChoice); - if (propName == null) { - if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names - throw new NullPointerException(); - } - } catch (RuntimeException e1) { - errln("Can't get property name for: " - + "Property (" + propNum + ")" - + ", NameChoice: " + nameChoice + ", " - + e1.getClass().getName()); - continue; + private static final boolean isCccValue(int ccc) { + switch (ccc) { + case 0: + case 1: + case 7: + case 8: + case 9: + case 200: + case 202: + case 216: + case 218: + case 220: + case 222: + case 224: + case 226: + case 228: + case 230: + case 232: + case 233: + case 234: + case 240: + return true; + default: + return false; } - logln("Property (" + propNum + "): " + propName); - for (int valueNum = UCharacter.getIntPropertyMinValue(propNum); valueNum <= UCharacter.getIntPropertyMaxValue(propNum); ++valueNum) { - String valueName; - try { - valueName = UCharacter.getPropertyValueName(propNum, valueNum, nameChoice); - if (valueName == null) { - if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names - if ((propNum == UProperty.CANONICAL_COMBINING_CLASS || - propNum == UProperty.LEAD_CANONICAL_COMBINING_CLASS || - propNum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) && - 
!isCccValue(valueNum)) { - // Only a few of the canonical combining classes have names. - // Otherwise they are just integer values. + } + + public void TestPropertyAccess() { + int count = 0; + // test to see that all of the names work + for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) { + count++; + //Skipping tests in the non-exhaustive mode to shorten the test time ticket#6475 + if(getInclusion()<=5 && count%5!=0){ continue; - } else { - throw new NullPointerException(); - } } - } catch (RuntimeException e1) { - errln("Can't get property value name for: " - + "Property (" + propNum + "): " + propName + ", " - + "Value (" + valueNum + ") " - + ", NameChoice: " + nameChoice + ", " - + e1.getClass().getName()); - continue; - } - logln("Value (" + valueNum + "): " + valueName); - UnicodeSet testSet; - try { - testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]"); - } catch (RuntimeException e) { - errln("Can't create UnicodeSet for: " - + "Property (" + propNum + "): " + propName + ", " - + "Value (" + valueNum + "): " + valueName + ", " - + e.getClass().getName()); - continue; - } - UnicodeSet collectedErrors = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) { - int value = UCharacter.getIntPropertyValue(it.codepoint, propNum); - if (value != valueNum) { - collectedErrors.add(it.codepoint); + if (propNum >= UProperty.BINARY_LIMIT && propNum < UProperty.INT_START) { // skip the gap + propNum = UProperty.INT_START; } - } - if (collectedErrors.size() != 0) { - errln("Property Value Differs: " - + "Property (" + propNum + "): " + propName + ", " - + "Value (" + valueNum + "): " + valueName + ", " - + "Differing values: " + collectedErrors.toPattern(true)); - } + for (int nameChoice = UProperty.NameChoice.SHORT; nameChoice <= UProperty.NameChoice.LONG; ++nameChoice) { + String propName; + try { + propName = UCharacter.getPropertyName(propNum, nameChoice); + if (propName == 
null) { + if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names + throw new NullPointerException(); + } + } catch (RuntimeException e1) { + errln("Can't get property name for: " + + "Property (" + propNum + ")" + + ", NameChoice: " + nameChoice + ", " + + e1.getClass().getName()); + continue; + } + logln("Property (" + propNum + "): " + propName); + for (int valueNum = UCharacter.getIntPropertyMinValue(propNum); valueNum <= UCharacter.getIntPropertyMaxValue(propNum); ++valueNum) { + String valueName; + try { + valueName = UCharacter.getPropertyValueName(propNum, valueNum, nameChoice); + if (valueName == null) { + if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names + if ((propNum == UProperty.CANONICAL_COMBINING_CLASS || + propNum == UProperty.LEAD_CANONICAL_COMBINING_CLASS || + propNum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) && + !isCccValue(valueNum)) { + // Only a few of the canonical combining classes have names. + // Otherwise they are just integer values. 
+ continue; + } else { + throw new NullPointerException(); + } + } + } catch (RuntimeException e1) { + errln("Can't get property value name for: " + + "Property (" + propNum + "): " + propName + ", " + + "Value (" + valueNum + ") " + + ", NameChoice: " + nameChoice + ", " + + e1.getClass().getName()); + continue; + } + logln("Value (" + valueNum + "): " + valueName); + UnicodeSet testSet; + try { + testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]"); + } catch (RuntimeException e) { + errln("Can't create UnicodeSet for: " + + "Property (" + propNum + "): " + propName + ", " + + "Value (" + valueNum + "): " + valueName + ", " + + e.getClass().getName()); + continue; + } + UnicodeSet collectedErrors = new UnicodeSet(); + for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) { + int value = UCharacter.getIntPropertyValue(it.codepoint, propNum); + if (value != valueNum) { + collectedErrors.add(it.codepoint); + } + } + if (collectedErrors.size() != 0) { + errln("Property Value Differs: " + + "Property (" + propNum + "): " + propName + ", " + + "Value (" + valueNum + "): " + valueName + ", " + + "Differing values: " + collectedErrors.toPattern(true)); + } + } + } } - } } - } - - - /** - * Test toPattern(). - */ - public void TestToPattern() throws Exception { - // Test that toPattern() round trips with syntax characters - // and whitespace. - for (int i = 0; i < OTHER_TOPATTERN_TESTS.length; ++i) { - checkPat(OTHER_TOPATTERN_TESTS[i], new UnicodeSet(OTHER_TOPATTERN_TESTS[i])); - } - for (int i = 0; i <= 0x10FFFF; ++i) { - if ((i <= 0xFF && !UCharacter.isLetter(i)) || UCharacter.isWhitespace(i)) { - // check various combinations to make sure they all work. - if (i != 0 && !toPatternAux(i, i)) continue; - if (!toPatternAux(0, i)) continue; - if (!toPatternAux(i, 0xFFFF)) continue; - } - } - - // Test pattern behavior of multicharacter strings. 
- UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]"); - expectToPattern(s, "[a-z{aa}{ab}]", - new String[] {"aa", "ab", NOT, "ac"}); - s.add("ac"); - expectToPattern(s, "[a-z{aa}{ab}{ac}]", - new String[] {"aa", "ab", "ac", NOT, "xy"}); - - s.applyPattern("[a-z {\\{l} {r\\}}]"); - expectToPattern(s, "[a-z{r\\}}{\\{l}]", - new String[] {"{l", "r}", NOT, "xy"}); - s.add("[]"); - expectToPattern(s, "[a-z{\\[\\]}{r\\}}{\\{l}]", - new String[] {"{l", "r}", "[]", NOT, "xy"}); - - s.applyPattern("[a-z {\u4E01\u4E02}{\\n\\r}]"); - expectToPattern(s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", - new String[] {"\u4E01\u4E02", "\n\r"}); - - s.clear(); - s.add("abc"); - s.add("abc"); - expectToPattern(s, "[{abc}]", - new String[] {"abc", NOT, "ab"}); - - // JB#3400: For 2 character ranges prefer [ab] to [a-b] - s.clear(); - s.add('a', 'b'); - expectToPattern(s, "[ab]", null); - - // Cover applyPattern, applyPropertyAlias - s.clear(); - s.applyPattern("[ab ]", true); - expectToPattern(s, "[ab]", new String[] {"a", NOT, "ab", " "}); - s.clear(); - s.applyPattern("[ab ]", false); - expectToPattern(s, "[\\ ab]", new String[] {"a", "\u0020", NOT, "ab"}); - - s.clear(); - s.applyPropertyAlias("nv", "0.5"); - expectToPattern(s, "[\\u00BD\\u0D74\\u0F2A\\u2CFD\\U00010141\\U00010175\\U00010176]", null); - // Unicode 5.1 adds Malayalam 1/2 (\u0D74) - - s.clear(); - s.applyPropertyAlias("gc", "Lu"); - // TODO expectToPattern(s, what?) 
- // RemoveAllStrings() - s.clear(); - s.applyPattern("[a-z{abc}{def}]"); - expectToPattern(s, "[a-z{abc}{def}]", null); - s.removeAllStrings(); - expectToPattern(s, "[a-z]", null); - } - - static String[] OTHER_TOPATTERN_TESTS = { - "[[:latin:]&[:greek:]]", - "[[:latin:]-[:greek:]]", - "[:nonspacing mark:]" - }; - - - public boolean toPatternAux(int start, int end) { - // use Integer.toString because Utility.hex doesn't handle ints - String source = "0x" + Integer.toString(start,16).toUpperCase(); - if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); - UnicodeSet testSet = new UnicodeSet(); - testSet.add(start, end); - return checkPat(source, testSet); - } - - boolean checkPat (String source, UnicodeSet testSet) { - String pat = ""; - try { - // What we want to make sure of is that a pattern generated - // by toPattern(), with or without escaped unprintables, can - // be passed back into the UnicodeSet constructor. - String pat0 = testSet.toPattern(true); - if (!checkPat(source + " (escaped)", testSet, pat0)) return false; - - //String pat1 = unescapeLeniently(pat0); - //if (!checkPat(source + " (in code)", testSet, pat1)) return false; - - String pat2 = testSet.toPattern(false); - if (!checkPat(source, testSet, pat2)) return false; - - //String pat3 = unescapeLeniently(pat2); - //if (!checkPat(source + " (in code)", testSet, pat3)) return false; - - //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); - logln(source + " => " + pat0 + ", " + pat2); - } catch (Exception e) { - errln("EXCEPTION in toPattern: " + source + " => " + pat); - return false; + + /** + * Test toPattern(). + */ + public void TestToPattern() throws Exception { + // Test that toPattern() round trips with syntax characters + // and whitespace. 
+ for (int i = 0; i < OTHER_TOPATTERN_TESTS.length; ++i) { + checkPat(OTHER_TOPATTERN_TESTS[i], new UnicodeSet(OTHER_TOPATTERN_TESTS[i])); + } + for (int i = 0; i <= 0x10FFFF; ++i) { + if ((i <= 0xFF && !UCharacter.isLetter(i)) || UCharacter.isWhitespace(i)) { + // check various combinations to make sure they all work. + if (i != 0 && !toPatternAux(i, i)) continue; + if (!toPatternAux(0, i)) continue; + if (!toPatternAux(i, 0xFFFF)) continue; + } + } + + // Test pattern behavior of multicharacter strings. + UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]"); + expectToPattern(s, "[a-z{aa}{ab}]", + new String[] {"aa", "ab", NOT, "ac"}); + s.add("ac"); + expectToPattern(s, "[a-z{aa}{ab}{ac}]", + new String[] {"aa", "ab", "ac", NOT, "xy"}); + + s.applyPattern("[a-z {\\{l} {r\\}}]"); + expectToPattern(s, "[a-z{r\\}}{\\{l}]", + new String[] {"{l", "r}", NOT, "xy"}); + s.add("[]"); + expectToPattern(s, "[a-z{\\[\\]}{r\\}}{\\{l}]", + new String[] {"{l", "r}", "[]", NOT, "xy"}); + + s.applyPattern("[a-z {\u4E01\u4E02}{\\n\\r}]"); + expectToPattern(s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", + new String[] {"\u4E01\u4E02", "\n\r"}); + + s.clear(); + s.add("abc"); + s.add("abc"); + expectToPattern(s, "[{abc}]", + new String[] {"abc", NOT, "ab"}); + + // JB#3400: For 2 character ranges prefer [ab] to [a-b] + s.clear(); + s.add('a', 'b'); + expectToPattern(s, "[ab]", null); + + // Cover applyPattern, applyPropertyAlias + s.clear(); + s.applyPattern("[ab ]", true); + expectToPattern(s, "[ab]", new String[] {"a", NOT, "ab", " "}); + s.clear(); + s.applyPattern("[ab ]", false); + expectToPattern(s, "[\\ ab]", new String[] {"a", "\u0020", NOT, "ab"}); + + s.clear(); + s.applyPropertyAlias("nv", "0.5"); + expectToPattern(s, "[\\u00BD\\u0D74\\u0F2A\\u2CFD\\U00010141\\U00010175\\U00010176]", null); + // Unicode 5.1 adds Malayalam 1/2 (\u0D74) + + s.clear(); + s.applyPropertyAlias("gc", "Lu"); + // TODO expectToPattern(s, what?) 
+ + // RemoveAllStrings() + s.clear(); + s.applyPattern("[a-z{abc}{def}]"); + expectToPattern(s, "[a-z{abc}{def}]", null); + s.removeAllStrings(); + expectToPattern(s, "[a-z]", null); } - return true; - } - - boolean checkPat (String source, UnicodeSet testSet, String pat) { - UnicodeSet testSet2 = new UnicodeSet(pat); - if (!testSet2.equals(testSet)) { - errln("Fail toPattern: " + source + "; " + pat + " => " + - testSet2.toPattern(false) + ", expected " + - testSet.toPattern(false)); - return false; + + static String[] OTHER_TOPATTERN_TESTS = { + "[[:latin:]&[:greek:]]", + "[[:latin:]-[:greek:]]", + "[:nonspacing mark:]" + }; + + + public boolean toPatternAux(int start, int end) { + // use Integer.toString because Utility.hex doesn't handle ints + String source = "0x" + Integer.toString(start,16).toUpperCase(); + if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); + UnicodeSet testSet = new UnicodeSet(); + testSet.add(start, end); + return checkPat(source, testSet); } - return true; - } - - // NOTE: copied the following from Utility. There ought to be a version in there with a flag - // that does the Java stuff - - public static int unescapeAt(String s, int[] offset16) { - int c; - int result = 0; - int n = 0; - int minDig = 0; - int maxDig = 0; - int bitsPerDigit = 4; - int dig; - int i; - - /* Check that offset is in range */ - int offset = offset16[0]; - int length = s.length(); - if (offset < 0 || offset >= length) { - return -1; + + boolean checkPat (String source, UnicodeSet testSet) { + String pat = ""; + try { + // What we want to make sure of is that a pattern generated + // by toPattern(), with or without escaped unprintables, can + // be passed back into the UnicodeSet constructor. 
+ String pat0 = testSet.toPattern(true); + if (!checkPat(source + " (escaped)", testSet, pat0)) return false; + + //String pat1 = unescapeLeniently(pat0); + //if (!checkPat(source + " (in code)", testSet, pat1)) return false; + + String pat2 = testSet.toPattern(false); + if (!checkPat(source, testSet, pat2)) return false; + + //String pat3 = unescapeLeniently(pat2); + //if (!checkPat(source + " (in code)", testSet, pat3)) return false; + + //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); + logln(source + " => " + pat0 + ", " + pat2); + } catch (Exception e) { + errln("EXCEPTION in toPattern: " + source + " => " + pat); + return false; + } + return true; } - - /* Fetch first UChar after '\\' */ - c = UTF16.charAt(s, offset); - offset += UTF16.getCharCount(c); - - /* Convert hexadecimal and octal escapes */ - switch (c) { - case 'u': - minDig = maxDig = 4; - break; - /* + + boolean checkPat (String source, UnicodeSet testSet, String pat) { + UnicodeSet testSet2 = new UnicodeSet(pat); + if (!testSet2.equals(testSet)) { + errln("Fail toPattern: " + source + "; " + pat + " => " + + testSet2.toPattern(false) + ", expected " + + testSet.toPattern(false)); + return false; + } + return true; + } + + // NOTE: copied the following from Utility. 
There ought to be a version in there with a flag + // that does the Java stuff + + public static int unescapeAt(String s, int[] offset16) { + int c; + int result = 0; + int n = 0; + int minDig = 0; + int maxDig = 0; + int bitsPerDigit = 4; + int dig; + int i; + + /* Check that offset is in range */ + int offset = offset16[0]; + int length = s.length(); + if (offset < 0 || offset >= length) { + return -1; + } + + /* Fetch first UChar after '\\' */ + c = UTF16.charAt(s, offset); + offset += UTF16.getCharCount(c); + + /* Convert hexadecimal and octal escapes */ + switch (c) { + case 'u': + minDig = maxDig = 4; + break; + /* case 'U': minDig = maxDig = 8; break; @@ -317,1770 +318,1926 @@ public class UnicodeSetTest extends TestFmwk { minDig = 1; maxDig = 2; break; - */ - default: - dig = UCharacter.digit(c, 8); - if (dig >= 0) { - minDig = 1; - maxDig = 3; - n = 1; /* Already have first octal digit */ - bitsPerDigit = 3; - result = dig; - } - break; - } - if (minDig != 0) { - while (offset < length && n < maxDig) { - // TEMPORARY - // TODO: Restore the char32-based code when UCharacter.digit - // is working (Bug 66). - - //c = UTF16.charAt(s, offset); - //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); - c = s.charAt(offset); - dig = Character.digit((char)c, (bitsPerDigit == 3) ? 
8 : 16); - if (dig < 0) { - break; + */ + default: + dig = UCharacter.digit(c, 8); + if (dig >= 0) { + minDig = 1; + maxDig = 3; + n = 1; /* Already have first octal digit */ + bitsPerDigit = 3; + result = dig; + } + break; } - result = (result << bitsPerDigit) | dig; - //offset += UTF16.getCharCount(c); - ++offset; - ++n; - } - if (n < minDig) { - return -1; - } - offset16[0] = offset; - return result; - } - - /* Convert C-style escapes in table */ - for (i=0; i 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false; - - // A - B size == A.size - A&B.size - if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false; - - // B - A size == B.size - A&B.size - if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false; - - - return true; - } - - void checkSetRelation(SortedSet a, SortedSet b, String message) { - for (int i = 0; i < 8; ++i) { - - boolean hasRelation = SortedSetRelation.hasRelation(a, i, b); - boolean dumbHasRelation = dumbHasRelation(a, i, b); - - logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); - - if (hasRelation != dumbHasRelation) { - errln("FAIL: " + - message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); - } - } - logln(""); - } - - /** - * Test the [:Latin:] syntax. - */ - public void TestScriptSet() { - - expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); - - expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); - - /* Jitterbug 1423 */ - expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); - - } - - /** - * Test the [:Latin:] syntax. 
- */ - public void TestPropertySet() { - String[] DATA = { - // Pattern, Chars IN, Chars NOT in - - "[:Latin:]", - "aA", - "\u0391\u03B1", - - "[\\p{Greek}]", - "\u0391\u03B1", - "aA", - - "\\P{ GENERAL Category = upper case letter }", - "abc", - "ABC", - - // Combining class: @since ICU 2.2 - // Check both symbolic and numeric - "\\p{ccc=Nukta}", - "\u0ABC", - "abc", - - "\\p{Canonical Combining Class = 11}", - "\u05B1", - "\u05B2", - - "[:c c c = iota subscript :]", - "\u0345", - "xyz", - - // Bidi class: @since ICU 2.2 - "\\p{bidiclass=lefttoright}", - "abc", - "\u0671\u0672", - - // Binary properties: @since ICU 2.2 - "\\p{ideographic}", - "\u4E0A", - "x", - - "[:math=false:]", - "q)*(", // )(and * were removed from math in Unicode 4.0.1 - "+<>^", - - // JB#1767 \N{}, \p{ASCII} - "[:Ascii:]", - "abc\u0000\u007F", - "\u0080\u4E00", - - "[\\N{ latin small letter a }[:name= latin small letter z:]]", - "az", - "qrs", - - // JB#2015 - "[:any:]", - "a\\U0010FFFF", - "", - - "[:nv=0.5:]", - "\u00BD\u0F2A", - "\u00BC", - - // JB#2653: Age - "[:Age=1.1:]", - "\u03D6", // 1.1 - "\u03D8\u03D9", // 3.2 - - "[:Age=3.1:]", - "\\u1800\\u3400\\U0002f800", - "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", - - // JB#2350: Case_Sensitive - "[:Case Sensitive:]", - "A\u1FFC\\U00010410", - ";\u00B4\\U00010500", - - - // Regex compatibility test - "[-b]", // leading '-' is literal - "-b", - "ac", - - "[^-b]", // leading '-' is literal - "ac", - "-b", - - "[b-]", // trailing '-' is literal - "-b", - "ac", - - "[^b-]", // trailing '-' is literal - "ac", - "-b", - - "[a-b-]", // trailing '-' is literal - "ab-", - "c=", - - "[[a-q]&[p-z]-]", // trailing '-' is literal - "pq-", - "or=", - - "[\\s|\\)|:|$|\\>]", // from regex tests - "s|):$>", - "\\abc", - - "[\uDC00cd]", // JB#2906: isolated trail at start - "cd\uDC00", - "ab\uD800\\U00010000", - - "[ab\uD800]", // JB#2906: isolated trail at start - "ab\uD800", - "cd\uDC00\\U00010000", - - "[ab\uD800cd]", // JB#2906: isolated 
lead in middle - "abcd\uD800", - "ef\uDC00\\U00010000", - - "[ab\uDC00cd]", // JB#2906: isolated trail in middle - "abcd\uDC00", - "ef\uD800\\U00010000", - - "[:^lccc=0:]", // Lead canonical class - "\u0300\u0301", - "abcd\u00c0\u00c5", - - "[:^tccc=0:]", // Trail canonical class - "\u0300\u0301\u00c0\u00c5", - "abcd", - - "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class - "\u0300\u0301\u00c0\u00c5", - "abcd", - - "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) - "", - "abcd\u0300\u0301\u00c0\u00c5", - - "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not - "\u0F73\u0F75\u0F81", - "abcd\u0300\u0301\u00c0\u00c5", - - "[:Assigned:]", - "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", - "\\u0888\\uFDD3\\uFFFE\\U00050005", - - }; - - for (int i=0; i indexOf() => " + set.indexOf(c)); - } - } - int c = set.charAt(set.size()); - if (c != -1) { - errln("FAIL: charAt() = " + - Utility.escape(String.valueOf(c))); - } - int j = set.indexOf('q'); - if (j != -1) { - errln("FAIL: indexOf('q') = " + j); - } - } - - public void TestContainsString() { - UnicodeSet x = new UnicodeSet("[a{bc}]"); - if (x.contains("abc")) errln("FAIL"); - } - - public void TestExhaustive() { - // exhaustive tests. Simulate UnicodeSets with integers. - // That gives us very solid tests (except for large memory tests). - - char limit = (char)128; - - for (char i = 0; i < limit; ++i) { - logln("Testing " + i + ", " + bitsToSet(i)); - _testComplement(i); - - // AS LONG AS WE ARE HERE, check roundtrip - checkRoundTrip(bitsToSet(i)); - - for (char j = 0; j < limit; ++j) { - _testAdd(i,j); - _testXor(i,j); - _testRetain(i,j); - _testRemove(i,j); - } - } - } - - /** - * Make sure each script name and abbreviated name can be used - * to construct a UnicodeSet. 
- */ - public void TestScriptNames() { - for (int i=0; i " + set.toPattern(false)); - } catch (IllegalArgumentException e) { - if (pat.length() == 0) { - errln("FAIL (in UScript): No name for script " + i); - } else { - errln("FAIL: Couldn't create " + pat); - } - } - } - } - } - - /** - * Test closure API. - */ - public void TestCloseOver() { - String CASE = String.valueOf(UnicodeSet.CASE); - String[] DATA = { - // selector, input, output - CASE, - "[aq\u00DF{Bc}{bC}{Fi}]", - "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 - - CASE, - "[\u01F1]", // 'DZ' - "[\u01F1\u01F2\u01F3]", - - CASE, - "[\u1FB4]", - "[\u1FB4{\u03AC\u03B9}]", - - CASE, - "[{F\uFB01}]", - "[\uFB03{ffi}]", - - CASE, - "[a-z]","[A-Za-z\u017F\u212A]", - CASE, - "[abc]","[A-Ca-c]", - CASE, - "[ABC]","[A-Ca-c]", - }; - - UnicodeSet s = new UnicodeSet(); - UnicodeSet t = new UnicodeSet(); - for (int i=0; i " + exp); - } else { - errln("FAIL: " + pat + ".closeOver(" + selector + ") => " + - s.toPattern(true) + ", expected " + exp); - } - } - - // Test the pattern API - s.applyPattern("[abc]", UnicodeSet.CASE); - expectContainment(s, "abcABC", "defDEF"); - s = new UnicodeSet("[^abc]", UnicodeSet.CASE); - expectContainment(s, "defDEF", "abcABC"); - } - - public void TestEscapePattern() { - // The following pattern must contain at least one range "c-d" - // for which isRuleWhiteSpace(c) or isRuleWhiteSpace(d) is true. - String pattern = - "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; - String exp = - "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; - // We test this with two passes; in the second pass we - // pre-unescape the pattern. Since U+200E is rule whitespace, - // this fails -- which is what we expect. 
- for (int pass=1; pass<=2; ++pass) { - String pat = pattern; - if (pass==2) { - pat = Utility.unescape(pat); - } - // Pattern is only good for pass 1 - boolean isPatternValid = (pass==1); - - UnicodeSet set = null; - try { - set = new UnicodeSet(pat); - } catch (IllegalArgumentException e) { - set = null; - } - if ((set != null) != isPatternValid){ - errln("FAIL: applyPattern(" + - Utility.escape(pat) + ") => " + set); - continue; - } - if (set == null) { - continue; - } - if (set.contains((char)0x0644)){ - errln("FAIL: " + Utility.escape(pat) + " contains(U+0664)"); - } - - String newpat = set.toPattern(true); - if (newpat.equals(exp)) { - logln(Utility.escape(pat) + " => " + newpat); - } else { - errln("FAIL: " + Utility.escape(pat) + " => " + newpat); - } - - for (int i=0; i 5) { - // iterations = (params.inclusion-5) * 200; - // } - // for (; i < iterations; ++i) { - // double start = random.nextGaussian() * 0x10000; - // if (start < 0) start = - start; - // if (start > 0x10FFFF) { - // start = 0x10FFFF; - // } - // double end = random.nextGaussian() * 0x100; - // if (end < 0) end = -end; - // end = start + end; - // if (end > 0x10FFFF) { - // end = 0x10FFFF; - // } - // test.complement((int)start, (int)end); - // checkPrettySet(pp, i, test); - // } - // }catch(RuntimeException ex){ - // warnln("Could not load Collator"); - // } - // } - - // private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { - // String pretty = pp.toPattern(test); - // UnicodeSet retry = new UnicodeSet(pretty); - // if (!test.equals(retry)) { - // errln(i + ". Failed test: " + test + " != " + pretty); - // } else { - // logln(i + ". 
Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); - // } - // } - - private String truncate(String string) { - if (string.length() <= 100) return string; - return string.substring(0,97) + "..."; - } - - public class TokenSymbolTable implements SymbolTable { - HashMap contents = new HashMap(); - + /** - * (Non-SymbolTable API) Add the given variable and value to - * the table. Variable should NOT contain leading '$'. + * Convert all escapes in a given string using unescapeAt(). + * Leave invalid escape sequences unchanged. */ - public void add(String var, String value) { - char[] buffer = new char[value.length()]; - value.getChars(0, value.length(), buffer, 0); - add(var, buffer); + public static String unescapeLeniently(String s) { + StringBuffer buf = new StringBuffer(); + int[] pos = new int[1]; + for (int i=0; i \"" + - new String(body) + "\""); - contents.put(var, body); + public void TestMinimalRep() { + // This is pretty thoroughly tested by checkCanonicalRep() + // run against the exhaustive operation results. Use the code + // here for debugging specific spot problems. 
+ + // 1 overlap against 2 + UnicodeSet set = new UnicodeSet("[h-km-q]"); + UnicodeSet set2 = new UnicodeSet("[i-o]"); + set.addAll(set2); + expectPairs(set, "hq"); + // right + set.applyPattern("[a-m]"); + set2.applyPattern("[e-o]"); + set.addAll(set2); + expectPairs(set, "ao"); + // left + set.applyPattern("[e-o]"); + set2.applyPattern("[a-m]"); + set.addAll(set2); + expectPairs(set, "ao"); + // 1 overlap against 3 + set.applyPattern("[a-eg-mo-w]"); + set2.applyPattern("[d-q]"); + set.addAll(set2); + expectPairs(set, "aw"); + } + + public void TestAPI() { + // default ct + UnicodeSet set = new UnicodeSet(); + if (!set.isEmpty() || set.getRangeCount() != 0) { + errln("FAIL, set should be empty but isn't: " + + set); + } + + // clear(), isEmpty() + set.add('a'); + if (set.isEmpty()) { + errln("FAIL, set shouldn't be empty but is: " + + set); + } + set.clear(); + if (!set.isEmpty()) { + errln("FAIL, set should be empty but isn't: " + + set); + } + + // size() + set.clear(); + if (set.size() != 0) { + errln("FAIL, size should be 0, but is " + set.size() + + ": " + set); + } + set.add('a'); + if (set.size() != 1) { + errln("FAIL, size should be 1, but is " + set.size() + + ": " + set); + } + set.add('1', '9'); + if (set.size() != 10) { + errln("FAIL, size should be 10, but is " + set.size() + + ": " + set); + } + set.clear(); + set.complement(); + if (set.size() != 0x110000) { + errln("FAIL, size should be 0x110000, but is" + set.size()); + } + + // contains(first, last) + set.clear(); + set.applyPattern("[A-Y 1-8 b-d l-y]"); + for (int i = 0; i 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false; + + // A - B size == A.size - A&B.size + if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false; + + // B - A size == B.size - A&B.size + if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false; + + + return true; + } + + void checkSetRelation(SortedSet a, SortedSet b, String message) { + for (int i = 0; i < 
8; ++i) { + + boolean hasRelation = SortedSetRelation.hasRelation(a, i, b); + boolean dumbHasRelation = dumbHasRelation(a, i, b); + + logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); + + if (hasRelation != dumbHasRelation) { + errln("FAIL: " + + message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); + } + } + logln(""); + } + + /** + * Test the [:Latin:] syntax. + */ + public void TestScriptSet() { + + expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); + + expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); + + /* Jitterbug 1423 */ + expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); + + } + + /** + * Test the [:Latin:] syntax. + */ + public void TestPropertySet() { + String[] DATA = { + // Pattern, Chars IN, Chars NOT in + + "[:Latin:]", + "aA", + "\u0391\u03B1", + + "[\\p{Greek}]", + "\u0391\u03B1", + "aA", + + "\\P{ GENERAL Category = upper case letter }", + "abc", + "ABC", + + // Combining class: @since ICU 2.2 + // Check both symbolic and numeric + "\\p{ccc=Nukta}", + "\u0ABC", + "abc", + + "\\p{Canonical Combining Class = 11}", + "\u05B1", + "\u05B2", + + "[:c c c = iota subscript :]", + "\u0345", + "xyz", + + // Bidi class: @since ICU 2.2 + "\\p{bidiclass=lefttoright}", + "abc", + "\u0671\u0672", + + // Binary properties: @since ICU 2.2 + "\\p{ideographic}", + "\u4E0A", + "x", + + "[:math=false:]", + "q)*(", // )(and * were removed from math in Unicode 4.0.1 + "+<>^", + + // JB#1767 \N{}, \p{ASCII} + "[:Ascii:]", + "abc\u0000\u007F", + "\u0080\u4E00", + + "[\\N{ latin small letter a }[:name= latin small letter z:]]", + "az", + "qrs", + + // JB#2015 + "[:any:]", + "a\\U0010FFFF", + "", + + "[:nv=0.5:]", + "\u00BD\u0F2A", + "\u00BC", + + // JB#2653: Age + "[:Age=1.1:]", + "\u03D6", // 1.1 + "\u03D8\u03D9", // 3.2 + + "[:Age=3.1:]", + "\\u1800\\u3400\\U0002f800", + 
"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", + + // JB#2350: Case_Sensitive + "[:Case Sensitive:]", + "A\u1FFC\\U00010410", + ";\u00B4\\U00010500", + + + // Regex compatibility test + "[-b]", // leading '-' is literal + "-b", + "ac", + + "[^-b]", // leading '-' is literal + "ac", + "-b", + + "[b-]", // trailing '-' is literal + "-b", + "ac", + + "[^b-]", // trailing '-' is literal + "ac", + "-b", + + "[a-b-]", // trailing '-' is literal + "ab-", + "c=", + + "[[a-q]&[p-z]-]", // trailing '-' is literal + "pq-", + "or=", + + "[\\s|\\)|:|$|\\>]", // from regex tests + "s|):$>", + "\\abc", + + "[\uDC00cd]", // JB#2906: isolated trail at start + "cd\uDC00", + "ab\uD800\\U00010000", + + "[ab\uD800]", // JB#2906: isolated trail at start + "ab\uD800", + "cd\uDC00\\U00010000", + + "[ab\uD800cd]", // JB#2906: isolated lead in middle + "abcd\uD800", + "ef\uDC00\\U00010000", + + "[ab\uDC00cd]", // JB#2906: isolated trail in middle + "abcd\uDC00", + "ef\uD800\\U00010000", + + "[:^lccc=0:]", // Lead canonical class + "\u0300\u0301", + "abcd\u00c0\u00c5", + + "[:^tccc=0:]", // Trail canonical class + "\u0300\u0301\u00c0\u00c5", + "abcd", + + "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class + "\u0300\u0301\u00c0\u00c5", + "abcd", + + "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) + "", + "abcd\u0300\u0301\u00c0\u00c5", + + "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. 
Complete canonical class is zero, but both lead and trail are not + "\u0F73\u0F75\u0F81", + "abcd\u0300\u0301\u00c0\u00c5", + + "[:Assigned:]", + "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", + "\\u0888\\uFDD3\\uFFFE\\U00050005", + + }; + + for (int i=0; i indexOf() => " + set.indexOf(c)); + } + } + int c = set.charAt(set.size()); + if (c != -1) { + errln("FAIL: charAt() = " + + Utility.escape(String.valueOf(c))); + } + int j = set.indexOf('q'); + if (j != -1) { + errln("FAIL: indexOf('q') = " + j); + } + } + + public void TestContainsString() { + UnicodeSet x = new UnicodeSet("[a{bc}]"); + if (x.contains("abc")) errln("FAIL"); + } + + public void TestExhaustive() { + // exhaustive tests. Simulate UnicodeSets with integers. + // That gives us very solid tests (except for large memory tests). + + char limit = (char)128; + + for (char i = 0; i < limit; ++i) { + logln("Testing " + i + ", " + bitsToSet(i)); + _testComplement(i); + + // AS LONG AS WE ARE HERE, check roundtrip + checkRoundTrip(bitsToSet(i)); + + for (char j = 0; j < limit; ++j) { + _testAdd(i,j); + _testXor(i,j); + _testRetain(i,j); + _testRemove(i,j); + } + } + } + + /** + * Make sure each script name and abbreviated name can be used + * to construct a UnicodeSet. + */ + public void TestScriptNames() { + for (int i=0; i " + set.toPattern(false)); + } catch (IllegalArgumentException e) { + if (pat.length() == 0) { + errln("FAIL (in UScript): No name for script " + i); + } else { + errln("FAIL: Couldn't create " + pat); + } + } + } + } + } + + /** + * Test closure API. 
+ */ + public void TestCloseOver() { + String CASE = String.valueOf(UnicodeSet.CASE); + String[] DATA = { + // selector, input, output + CASE, + "[aq\u00DF{Bc}{bC}{Fi}]", + "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 + + CASE, + "[\u01F1]", // 'DZ' + "[\u01F1\u01F2\u01F3]", + + CASE, + "[\u1FB4]", + "[\u1FB4{\u03AC\u03B9}]", + + CASE, + "[{F\uFB01}]", + "[\uFB03{ffi}]", + + CASE, + "[a-z]","[A-Za-z\u017F\u212A]", + CASE, + "[abc]","[A-Ca-c]", + CASE, + "[ABC]","[A-Ca-c]", + }; + + UnicodeSet s = new UnicodeSet(); + UnicodeSet t = new UnicodeSet(); + for (int i=0; i " + exp); + } else { + errln("FAIL: " + pat + ".closeOver(" + selector + ") => " + + s.toPattern(true) + ", expected " + exp); + } + } + + // Test the pattern API + s.applyPattern("[abc]", UnicodeSet.CASE); + expectContainment(s, "abcABC", "defDEF"); + s = new UnicodeSet("[^abc]", UnicodeSet.CASE); + expectContainment(s, "defDEF", "abcABC"); + } + + public void TestEscapePattern() { + // The following pattern must contain at least one range "c-d" + // for which isRuleWhiteSpace(c) or isRuleWhiteSpace(d) is true. + String pattern = + "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; + String exp = + "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; + // We test this with two passes; in the second pass we + // pre-unescape the pattern. Since U+200E is rule whitespace, + // this fails -- which is what we expect. 
+ for (int pass=1; pass<=2; ++pass) { + String pat = pattern; + if (pass==2) { + pat = Utility.unescape(pat); + } + // Pattern is only good for pass 1 + boolean isPatternValid = (pass==1); + + UnicodeSet set = null; + try { + set = new UnicodeSet(pat); + } catch (IllegalArgumentException e) { + set = null; + } + if ((set != null) != isPatternValid){ + errln("FAIL: applyPattern(" + + Utility.escape(pat) + ") => " + set); + continue; + } + if (set == null) { + continue; + } + if (set.contains((char)0x0644)){ + errln("FAIL: " + Utility.escape(pat) + " contains(U+0664)"); + } + + String newpat = set.toPattern(true); + if (newpat.equals(exp)) { + logln(Utility.escape(pat) + " => " + newpat); + } else { + errln("FAIL: " + Utility.escape(pat) + " => " + newpat); + } + + for (int i=0; i iterator() { + + ArrayList oldList = new ArrayList(); + for (UnicodeSetIterator it = new UnicodeSetIterator(set1); it.next();) { + oldList.add(it.getString()); + } + + ArrayList list1 = new ArrayList(); + for (String s : set1) { + list1.add(s); + } + assertEquals("iteration test", oldList, list1); + + //addAllTo(Iterable, U) + list1.clear(); + set1.addAllTo(list1); + assertEquals("iteration test", oldList, list1); + + list1 = set1.addAllTo(new ArrayList()); + assertEquals("addAllTo", oldList, list1); + + ArrayList list2 = set2.addAllTo(new ArrayList()); + ArrayList list3 = set3.addAllTo(new ArrayList()); + + // put them into different order, to check that order doesn't matter + TreeSet sorted1 = set1.addAllTo(new TreeSet()); + TreeSet sorted2 = set2.addAllTo(new TreeSet()); + TreeSet sorted3 = set3.addAllTo(new TreeSet()); + + //containsAll(Collection collection) + assertTrue("containsAll", set1.containsAll(list1)); + assertTrue("containsAll", set1.containsAll(sorted1)); + assertTrue("containsAll", set1.containsAll(list2)); + assertTrue("containsAll", set1.containsAll(sorted2)); + assertFalse("containsAll", set1.containsAll(list3)); + assertFalse("containsAll", set1.containsAll(sorted3)); + 
assertFalse("containsAll", set2.containsAll(list3)); + assertFalse("containsAll", set2.containsAll(sorted3)); + + //containsSome(Collection) + assertTrue("containsSome", set1.containsSome(list1)); + assertTrue("containsSome", set1.containsSome(sorted1)); + assertTrue("containsSome", set1.containsSome(list2)); + assertTrue("containsSome", set1.containsSome(sorted2)); + assertTrue("containsSome", set1.containsSome(list3)); + assertTrue("containsSome", set1.containsSome(sorted3)); + assertFalse("containsSome", set2.containsSome(list3)); + assertFalse("containsSome", set2.containsSome(sorted3)); + + //containsNone(Collection) + assertFalse("containsNone", set1.containsNone(list1)); + assertFalse("containsNone", set1.containsNone(sorted1)); + assertFalse("containsNone", set1.containsNone(list2)); + assertFalse("containsNone", set1.containsNone(sorted2)); + assertFalse("containsNone", set1.containsNone(list3)); + assertFalse("containsNone", set1.containsNone(sorted3)); + assertTrue("containsNone", set2.containsNone(list3)); + assertTrue("containsNone", set2.containsNone(sorted3)); + + //addAll(String...) 
+ UnicodeSet other3 = new UnicodeSet().addAll("d", "m", "n", "dh"); + assertEquals("addAll", set3, other3); + + //removeAll(Collection) + UnicodeSet mod1 = new UnicodeSet(set1).removeAll(set2); + UnicodeSet mod2 = new UnicodeSet(set1).removeAll(list2); + assertEquals("remove all", mod1, mod2); + + //retainAll(Collection) + mod1 = new UnicodeSet(set1).retainAll(set2); + mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet())); + assertEquals("remove all", mod1, mod2); } - /* (non-Javadoc) - * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) - */ - public char[] lookup(String s) { - logln("TokenSymbolTable: lookup \"" + s + "\" => \"" + - new String((char[]) contents.get(s)) + "\""); - return (char[])contents.get(s); + public void TestComparison() { + UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); + UnicodeSet set2 = new UnicodeSet("[e-f {ch}]").freeze(); + UnicodeSet set3 = new UnicodeSet("[d m-n {dh}]").freeze(); + + //compareTo(UnicodeSet) + // do indirectly, by sorting + List unsorted = Arrays.asList(set1, set2, set3); + List goal = Arrays.asList(set1, set3, set2); + + List sorted = new ArrayList(new TreeSet(unsorted)); + assertNotEquals("compareTo", unsorted, sorted); + assertEquals("compareTo", goal, sorted); + + //compare(String, int) + // make a list of interesting combinations + List sources = Arrays.asList("\u0000", "a", "b", "\uD7FF", "\uD800", "\uDBFF", "\uDC00", "\uDFFF", "\uE000", "\uFFFD", "\uFFFF"); + TreeSet target = new TreeSet(); + for (String s : sources) { + target.add(s); + for (String t : sources) { + target.add(s + t); + for (String u : sources) { + target.add(s + t + u); + } + } + } + // now compare all the combinations. If any of them is a code point, use it. 
+ for (String last : target) { + for (String curr : target) { + int lastCount = Character.codePointCount(last, 0, last.length()); + int currCount = Character.codePointCount(curr, 0, curr.length()); + int comparison; + if (lastCount == 1) { + comparison = UnicodeSet.compare(last.codePointAt(0), curr); + } else if (currCount == 1) { + comparison = UnicodeSet.compare(last, curr.codePointAt(0)); + } else { + continue; + } + if (comparison != last.compareTo(curr)) { + // repeat for debugging + if (lastCount == 1) { + comparison = UnicodeSet.compare(last.codePointAt(0), curr); + } else if (currCount == 1) { + comparison = UnicodeSet.compare(last, curr.codePointAt(0)); + } + errln("Failure in comparing " + last + " & " + curr); + } + } + } + + //compare(Iterable, Iterable) + int max = 10; + List test1 = new ArrayList(max); + List test2 = new ArrayList(max); + for (int i = 0; i <= max; ++i) { + test1.add("a" + i); + test2.add("a" + (max - i)); // add in reverse order + } + assertNotEquals("compare iterable test", test1, test2); + TreeSet sortedTest1 = new TreeSet(test1); + TreeSet sortedTest2 = new TreeSet(test2); + assertEquals("compare iterable test", sortedTest1, sortedTest2); } - - /* (non-Javadoc) - * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) - */ - public UnicodeMatcher lookupMatcher(int ch) { - return null; + + // **************************************** + // UTILITIES + // **************************************** + + public void checkModification(UnicodeSet original, boolean isFrozen) { + main: + for (int i = 0; ;++i) { + UnicodeSet test = (UnicodeSet) (isFrozen ? 
original.clone() : original.cloneAsThawed()); + boolean gotException = true; + boolean checkEquals = true; + try { + switch(i) { + case 0: test.add(0); break; + case 1: test.add(0,1); break; + case 2: test.add("a"); break; + case 3: List a = new ArrayList(); a.add("a"); test.addAll(a); break; + case 4: test.addAll("ab"); break; + case 5: test.addAll(new UnicodeSet("[ab]")); break; + case 6: test.applyIntPropertyValue(0,0); break; + case 7: test.applyPattern("[ab]"); break; + case 8: test.applyPattern("[ab]", true); break; + case 9: test.applyPattern("[ab]", 0); break; + case 10: test.applyPropertyAlias("hex","true"); break; + case 11: test.applyPropertyAlias("hex", "true", null); break; + case 12: test.closeOver(UnicodeSet.CASE); break; + case 13: test.compact(); checkEquals = false; break; + case 14: test.complement(0); break; + case 15: test.complement(0,0); break; + case 16: test.complement("ab"); break; + case 17: test.complementAll("ab"); break; + case 18: test.complementAll(new UnicodeSet("[ab]")); break; + case 19: test.remove(' '); break; + case 20: test.remove(' ','a'); break; + case 21: test.remove(" "); break; + case 22: test.removeAll(" a"); break; + case 23: test.removeAll(new UnicodeSet("[\\ a]")); break; + case 24: test.retain(' '); break; + case 25: test.retain(' ','a'); break; + case 26: test.retain(" "); break; + case 27: test.retainAll(" a"); break; + case 28: test.retainAll(new UnicodeSet("[\\ a]")); break; + case 29: test.set(0,1); break; + case 30: test.set(new UnicodeSet("[ab]")); break; + + default: continue main; // so we don't keep having to change the endpoint, and gaps are not skipped. 
+ case 35: return; + } + gotException = false; + } catch (UnsupportedOperationException e) { + // do nothing + } + if (isFrozen && !gotException) errln(i + ") attempt to modify frozen object didn't result in an exception"); + if (!isFrozen && gotException) errln(i + ") attempt to modify thawed object did result in an exception"); + if (checkEquals) { + if (test.equals(original)) { + if (!isFrozen) errln(i + ") attempt to modify thawed object didn't change the object"); + } else { // unequal + if (isFrozen) errln(i + ") attempt to modify frozen object changed the object"); + } + } + } } - - /* (non-Javadoc) - * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, + + // String[] prettyData = { + // "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case + // "[:any:]", + // "[:whitespace:]", + // "[:linebreak=AL:]", + // }; + + // public void TestPrettyPrinting() { + // try{ + // PrettyPrinter pp = new PrettyPrinter(); + + // int i = 0; + // for (; i < prettyData.length; ++i) { + // UnicodeSet test = new UnicodeSet(prettyData[i]); + // checkPrettySet(pp, i, test); + // } + // Random random = new Random(0); + // UnicodeSet test = new UnicodeSet(); + + // // To keep runtimes under control, make the number of random test cases + // // to try depends on the test framework exhaustive setting. + // // params.inclusions = 5: default exhaustive value + // // params.inclusions = 10: max exhaustive value. 
+ // int iterations = 50; + // if (params.inclusion > 5) { + // iterations = (params.inclusion-5) * 200; + // } + // for (; i < iterations; ++i) { + // double start = random.nextGaussian() * 0x10000; + // if (start < 0) start = - start; + // if (start > 0x10FFFF) { + // start = 0x10FFFF; + // } + // double end = random.nextGaussian() * 0x100; + // if (end < 0) end = -end; + // end = start + end; + // if (end > 0x10FFFF) { + // end = 0x10FFFF; + // } + // test.complement((int)start, (int)end); + // checkPrettySet(pp, i, test); + // } + // }catch(RuntimeException ex){ + // warnln("Could not load Collator"); + // } + // } + + // private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { + // String pretty = pp.toPattern(test); + // UnicodeSet retry = new UnicodeSet(pretty); + // if (!test.equals(retry)) { + // errln(i + ". Failed test: " + test + " != " + pretty); + // } else { + // logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); + // } + // } + + private String truncate(String string) { + if (string.length() <= 100) return string; + return string.substring(0,97) + "..."; + } + + public class TokenSymbolTable implements SymbolTable { + HashMap contents = new HashMap(); + + /** + * (Non-SymbolTable API) Add the given variable and value to + * the table. Variable should NOT contain leading '$'. + */ + public void add(String var, String value) { + char[] buffer = new char[value.length()]; + value.getChars(0, value.length(), buffer, 0); + add(var, buffer); + } + + /** + * (Non-SymbolTable API) Add the given variable and value to + * the table. Variable should NOT contain leading '$'. 
+ */ + public void add(String var, char[] body) { + logln("TokenSymbolTable: add \"" + var + "\" => \"" + + new String(body) + "\""); + contents.put(var, body); + } + + /* (non-Javadoc) + * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) + */ + public char[] lookup(String s) { + logln("TokenSymbolTable: lookup \"" + s + "\" => \"" + + new String((char[]) contents.get(s)) + "\""); + return (char[])contents.get(s); + } + + /* (non-Javadoc) + * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) + */ + public UnicodeMatcher lookupMatcher(int ch) { + return null; + } + + /* (non-Javadoc) + * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int) + */ + public String parseReference(String text, ParsePosition pos, int + limit) { + int cp; + int start = pos.getIndex(); + int i; + for (i = start; i < limit; i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(text, i); + if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { + break; + } + } + logln("TokenSymbolTable: parse \"" + text + "\" from " + + start + " to " + i + + " => \"" + text.substring(start,i) + "\""); + pos.setIndex(i); + return text.substring(start,i); + } + } + + public void TestSurrogate() { + String DATA[] = { + // These should all behave identically + "[abc\\uD800\\uDC00]", + "[abc\uD800\uDC00]", + "[abc\\U00010000]", + }; + for (int i=0; i= 0 but is " + + n + " for " + Utility.escape(set.toString())); + return; } - } - logln("TokenSymbolTable: parse \"" + text + "\" from " + - start + " to " + i + - " => \"" + text.substring(start,i) + "\""); - pos.setIndex(i); - return text.substring(start,i); - } - } - - public void TestSurrogate() { - String DATA[] = { - // These should all behave identically - "[abc\\uD800\\uDC00]", - "[abc\uD800\uDC00]", - "[abc\\U00010000]", - }; - for (int i=0; i end) { + errln("FAIL result of " + msg + + ": range " + (i+1) + + " start > end: " + start + ", " + end + + " for " + Utility.escape(set.toString())); + } + 
if (i > 0 && start <= last) { + errln("FAIL result of " + msg + + ": range " + (i+1) + + " overlaps previous range: " + start + ", " + end + + " for " + Utility.escape(set.toString())); + } + last = end; } - if (containsAll != x.containsAll(y)) { - x.containsAll(y); // repeat for debugging - errln("FAILED: " + x + " containsAll " + y); + } + + /** + * Convert a bitmask to a UnicodeSet. + */ + UnicodeSet bitsToSet(int a) { + UnicodeSet result = new UnicodeSet(); + for (int i = 0; i < 32; ++i) { + if ((a & (1< 0xFFFF) { + end = 0xFFFF; + i = set.getRangeCount(); // Should be unnecessary + } + pairs.append((char)start).append((char)end); + } + return pairs.toString(); } - checkCanonicalRep(z, "complement " + a); - } - - void _testAdd(int a, int b) { - UnicodeSet x = bitsToSet(a); - UnicodeSet y = bitsToSet(b); - UnicodeSet z = bitsToSet(a); - z.addAll(y); - int c = setToBits(z); - if (c != (a | b)) { - errln(Utility.escape("FAILED: add: " + x + " | " + y + " != " + z)); - errln("FAILED: add: " + a + " | " + b + " != " + c); + + /** + * Test function. 
Make sure that the sets have the right relation + */ + + void expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message) { + int relation = ((Integer) relationObj).intValue(); + UnicodeSet set1 = (UnicodeSet) set1Obj; + UnicodeSet set2 = (UnicodeSet) set2Obj; + + // by-the-by, check the iterator + checkRoundTrip(set1); + checkRoundTrip(set2); + + boolean contains = set1.containsAll(set2); + boolean isContained = set2.containsAll(set1); + boolean disjoint = set1.containsNone(set2); + boolean equals = set1.equals(set2); + + UnicodeSet intersection = new UnicodeSet(set1).retainAll(set2); + UnicodeSet minus12 = new UnicodeSet(set1).removeAll(set2); + UnicodeSet minus21 = new UnicodeSet(set2).removeAll(set1); + + // test basic properties + + if (contains != (intersection.size() == set2.size())) { + errln("FAIL contains1" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + if (contains != (intersection.equals(set2))) { + errln("FAIL contains2" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + if (isContained != (intersection.size() == set1.size())) { + errln("FAIL isContained1" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + if (isContained != (intersection.equals(set1))) { + errln("FAIL isContained2" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + if ((contains && isContained) != equals) { + errln("FAIL equals" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + if (disjoint != (intersection.size() == 0)) { + errln("FAIL disjoint" + set1.toPattern(true) + ", " + set2.toPattern(true)); + } + + // Now see if the expected relation is true + int status = (minus12.size() != 0 ? 4 : 0) + | (intersection.size() != 0 ? 2 : 0) + | (minus21.size() != 0 ? 
1 : 0); + + if (status != relation) { + errln("FAIL relation incorrect" + message + + "; desired = " + RELATION_NAME[relation] + + "; found = " + RELATION_NAME[status] + + "; set1 = " + set1.toPattern(true) + + "; set2 = " + set2.toPattern(true) + ); + } } - checkCanonicalRep(z, "add " + a + "," + b); - } - - void _testRetain(int a, int b) { - UnicodeSet x = bitsToSet(a); - UnicodeSet y = bitsToSet(b); - UnicodeSet z = bitsToSet(a); - z.retainAll(y); - int c = setToBits(z); - if (c != (a & b)) { - errln("FAILED: retain: " + x + " & " + y + " != " + z); - errln("FAILED: retain: " + a + " & " + b + " != " + c); + + /** + * Basic consistency check for a few items. + * That the iterator works, and that we can create a pattern and + * get the same thing back + */ + + void checkRoundTrip(UnicodeSet s) { + String pat = s.toPattern(false); + UnicodeSet t = copyWithIterator(s, false); + checkEqual(s, t, "iterator roundtrip"); + + t = copyWithIterator(s, true); // try range + checkEqual(s, t, "iterator roundtrip"); + + t = new UnicodeSet(pat); + checkEqual(s, t, "toPattern(false)"); + + pat = s.toPattern(true); + t = new UnicodeSet(pat); + checkEqual(s, t, "toPattern(true)"); } - checkCanonicalRep(z, "retain " + a + "," + b); - } - - void _testRemove(int a, int b) { - UnicodeSet x = bitsToSet(a); - UnicodeSet y = bitsToSet(b); - UnicodeSet z = bitsToSet(a); - z.removeAll(y); - int c = setToBits(z); - if (c != (a &~ b)) { - errln("FAILED: remove: " + x + " &~ " + y + " != " + z); - errln("FAILED: remove: " + a + " &~ " + b + " != " + c); - } - checkCanonicalRep(z, "remove " + a + "," + b); - } - - void _testXor(int a, int b) { - UnicodeSet x = bitsToSet(a); - UnicodeSet y = bitsToSet(b); - UnicodeSet z = bitsToSet(a); - z.complementAll(y); - int c = setToBits(z); - if (c != (a ^ b)) { - errln("FAILED: complement: " + x + " ^ " + y + " != " + z); - errln("FAILED: complement: " + a + " ^ " + b + " != " + c); - } - checkCanonicalRep(z, "complement " + a + "," + b); - } - - /** - 
* Check that ranges are monotonically increasing and non- - * overlapping. - */ - void checkCanonicalRep(UnicodeSet set, String msg) { - int n = set.getRangeCount(); - if (n < 0) { - errln("FAIL result of " + msg + - ": range count should be >= 0 but is " + - n + " for " + Utility.escape(set.toString())); - return; - } - int last = 0; - for (int i=0; i end) { - errln("FAIL result of " + msg + - ": range " + (i+1) + - " start > end: " + start + ", " + end + - " for " + Utility.escape(set.toString())); - } - if (i > 0 && start <= last) { - errln("FAIL result of " + msg + - ": range " + (i+1) + - " overlaps previous range: " + start + ", " + end + - " for " + Utility.escape(set.toString())); - } - last = end; - } - } - - /** - * Convert a bitmask to a UnicodeSet. - */ - UnicodeSet bitsToSet(int a) { - UnicodeSet result = new UnicodeSet(); - for (int i = 0; i < 32; ++i) { - if ((a & (1< 0xFFFF) { - end = 0xFFFF; - i = set.getRangeCount(); // Should be unnecessary - } - pairs.append((char)start).append((char)end); - } - return pairs.toString(); - } - - /** - * Test function. 
Make sure that the sets have the right relation - */ - - void expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message) { - int relation = ((Integer) relationObj).intValue(); - UnicodeSet set1 = (UnicodeSet) set1Obj; - UnicodeSet set2 = (UnicodeSet) set2Obj; - - // by-the-by, check the iterator - checkRoundTrip(set1); - checkRoundTrip(set2); - - boolean contains = set1.containsAll(set2); - boolean isContained = set2.containsAll(set1); - boolean disjoint = set1.containsNone(set2); - boolean equals = set1.equals(set2); - - UnicodeSet intersection = new UnicodeSet(set1).retainAll(set2); - UnicodeSet minus12 = new UnicodeSet(set1).removeAll(set2); - UnicodeSet minus21 = new UnicodeSet(set2).removeAll(set1); - - // test basic properties - - if (contains != (intersection.size() == set2.size())) { - errln("FAIL contains1" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - if (contains != (intersection.equals(set2))) { - errln("FAIL contains2" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - if (isContained != (intersection.size() == set1.size())) { - errln("FAIL isContained1" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - if (isContained != (intersection.equals(set1))) { - errln("FAIL isContained2" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - if ((contains && isContained) != equals) { - errln("FAIL equals" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - if (disjoint != (intersection.size() == 0)) { - errln("FAIL disjoint" + set1.toPattern(true) + ", " + set2.toPattern(true)); - } - - // Now see if the expected relation is true - int status = (minus12.size() != 0 ? 4 : 0) - | (intersection.size() != 0 ? 2 : 0) - | (minus21.size() != 0 ? 
1 : 0); - - if (status != relation) { - errln("FAIL relation incorrect" + message - + "; desired = " + RELATION_NAME[relation] - + "; found = " + RELATION_NAME[status] - + "; set1 = " + set1.toPattern(true) - + "; set2 = " + set2.toPattern(true) - ); - } - } - - /** - * Basic consistency check for a few items. - * That the iterator works, and that we can create a pattern and - * get the same thing back - */ - - void checkRoundTrip(UnicodeSet s) { - String pat = s.toPattern(false); - UnicodeSet t = copyWithIterator(s, false); - checkEqual(s, t, "iterator roundtrip"); - - t = copyWithIterator(s, true); // try range - checkEqual(s, t, "iterator roundtrip"); - - t = new UnicodeSet(pat); - checkEqual(s, t, "toPattern(false)"); - - pat = s.toPattern(true); - t = new UnicodeSet(pat); - checkEqual(s, t, "toPattern(true)"); - } - - UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) { - UnicodeSet t = new UnicodeSet(); - UnicodeSetIterator it = new UnicodeSetIterator(s); - if (withRange) { - while (it.nextRange()) { - if (it.codepoint == UnicodeSetIterator.IS_STRING) { - t.add(it.string); + + UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) { + UnicodeSet t = new UnicodeSet(); + UnicodeSetIterator it = new UnicodeSetIterator(s); + if (withRange) { + while (it.nextRange()) { + if (it.codepoint == UnicodeSetIterator.IS_STRING) { + t.add(it.string); + } else { + t.add(it.codepoint, it.codepointEnd); + } + } } else { - t.add(it.codepoint, it.codepointEnd); + while (it.next()) { + if (it.codepoint == UnicodeSetIterator.IS_STRING) { + t.add(it.string); + } else { + t.add(it.codepoint); + } + } } - } - } else { - while (it.next()) { - if (it.codepoint == UnicodeSetIterator.IS_STRING) { - t.add(it.string); + return t; + } + + boolean checkEqual(UnicodeSet s, UnicodeSet t, String message) { + if (!s.equals(t)) { + errln("FAIL " + message + + "; source = " + s.toPattern(true) + + "; result = " + t.toPattern(true) + ); + return false; + } + return true; + } + + 
void expectEqual(String name, String pat1, String pat2) { + UnicodeSet set1, set2; + try { + set1 = new UnicodeSet(pat1); + set2 = new UnicodeSet(pat2); + } catch (IllegalArgumentException e) { + errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage()); + return; + } + if(!set1.equals(set2)) { + errln("FAIL: Sets built from patterns differ for \"" + name + "\""); + } + } + + /** + * Expect the given set to contain the characters in charsIn and + * to not contain those in charsOut. + */ + void expectContainment(String pat, String charsIn, String charsOut) { + UnicodeSet set; + try { + set = new UnicodeSet(pat); + } catch (IllegalArgumentException e) { + errln("FAIL: Couldn't create UnicodeSet from pattern \"" + + pat + "\": " + e.getMessage()); + return; + } + expectContainment(set, charsIn, charsOut); + } + + /** + * Expect the given set to contain the characters in charsIn and + * to not contain those in charsOut. + */ + void expectContainment(UnicodeSet set, String charsIn, String charsOut) { + StringBuffer bad = new StringBuffer(); + if (charsIn != null) { + charsIn = Utility.unescape(charsIn); + for (int i=0; i 0) { + errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + + ", expected containment of " + charsIn)); + } else { + logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); + } + } + if (charsOut != null) { + charsOut = Utility.unescape(charsOut); + bad.setLength(0); + for (int i=0; i 0) { + errln(Utility.escape("FAIL: set " + set + " contains " + bad + + ", expected non-containment of " + charsOut)); + } else { + logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); + } + } + } + + void expectPattern(UnicodeSet set, + String pattern, + String expectedPairs) { + set.applyPattern(pattern); + if (!getPairs(set).equals(expectedPairs)) { + errln("FAIL: applyPattern(\"" + pattern + + "\") => pairs \"" + + Utility.escape(getPairs(set)) + "\", expected \"" + + 
Utility.escape(expectedPairs) + "\""); } else { - t.add(it.codepoint); + logln("Ok: applyPattern(\"" + pattern + + "\") => pairs \"" + + Utility.escape(getPairs(set)) + "\""); } - } } - return t; - } - - boolean checkEqual(UnicodeSet s, UnicodeSet t, String message) { - if (!s.equals(t)) { - errln("FAIL " + message - + "; source = " + s.toPattern(true) - + "; result = " + t.toPattern(true) - ); - return false; - } - return true; - } - - void expectEqual(String name, String pat1, String pat2) { - UnicodeSet set1, set2; - try { - set1 = new UnicodeSet(pat1); - set2 = new UnicodeSet(pat2); - } catch (IllegalArgumentException e) { - errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage()); - return; - } - if(!set1.equals(set2)) { - errln("FAIL: Sets built from patterns differ for \"" + name + "\""); - } - } - - /** - * Expect the given set to contain the characters in charsIn and - * to not contain those in charsOut. - */ - void expectContainment(String pat, String charsIn, String charsOut) { - UnicodeSet set; - try { - set = new UnicodeSet(pat); - } catch (IllegalArgumentException e) { - errln("FAIL: Couldn't create UnicodeSet from pattern \"" + - pat + "\": " + e.getMessage()); - return; - } - expectContainment(set, charsIn, charsOut); - } - - /** - * Expect the given set to contain the characters in charsIn and - * to not contain those in charsOut. 
- */ - void expectContainment(UnicodeSet set, String charsIn, String charsOut) { - StringBuffer bad = new StringBuffer(); - if (charsIn != null) { - charsIn = Utility.unescape(charsIn); - for (int i=0; i \"" + pat + "\""); + } else { + errln("FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); + return; } - } - if (bad.length() > 0) { - errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + - ", expected containment of " + charsIn)); - } else { - logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); - } - } - if (charsOut != null) { - charsOut = Utility.unescape(charsOut); - bad.setLength(0); - for (int i=0; i 0) { - errln(Utility.escape("FAIL: set " + set + " contains " + bad + - ", expected non-containment of " + charsOut)); - } else { - logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); - } } - } - - void expectPattern(UnicodeSet set, - String pattern, - String expectedPairs) { - set.applyPattern(pattern); - if (!getPairs(set).equals(expectedPairs)) { - errln("FAIL: applyPattern(\"" + pattern + - "\") => pairs \"" + - Utility.escape(getPairs(set)) + "\", expected \"" + - Utility.escape(expectedPairs) + "\""); - } else { - logln("Ok: applyPattern(\"" + pattern + - "\") => pairs \"" + - Utility.escape(getPairs(set)) + "\""); + + void expectPairs(UnicodeSet set, String expectedPairs) { + if (!getPairs(set).equals(expectedPairs)) { + errln("FAIL: Expected pair list \"" + + Utility.escape(expectedPairs) + "\", got \"" + + Utility.escape(getPairs(set)) + "\""); + } } - } - - void expectToPattern(UnicodeSet set, - String expPat, - String[] expStrings) { - String pat = set.toPattern(true); - if (pat.equals(expPat)) { - logln("Ok: toPattern() => \"" + pat + "\""); - } else { - errln("FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); - return; + static final String CharsToUnicodeString(String s) { + return Utility.unescape(s); } - if (expStrings == null) { - return; - } - boolean in = 
true; - for (int i=0; i