ICU-13702 allow empty string in UnicodeSet

See #1519
This commit is contained in:
Markus Scherer 2020-12-29 00:39:40 +00:00
parent fb715eab4c
commit 9963b4d62a
17 changed files with 231 additions and 150 deletions

View file

@ -25,15 +25,25 @@ Here are a few examples of sets:
| `[abc123]` | The six characters a,b,c,1,2 and 3 |
| `[\p{Letter}]` | All characters with the Unicode General Category of Letter. |
String Values In addition to being a set of characters (of Unicode code points),
### String Values
In addition to being a set of characters (of Unicode code points),
a UnicodeSet may also contain string values. Conceptually, the UnicodeSet is
always a set of strings, not a set of characters, although in many common use
cases the strings are all of length one, which reduces to being a set of
characters.
This concept can be confusing when first encountered, probably because similar
set constructs from other environments (regular expressions) can only contain
characters.
set constructs from other environments
(e.g., character classes in most regular expression implementations)
can only contain characters.
Through ICU 68, it was not possible for a UnicodeSet to contain the empty string.
In Java, an exception was thrown. In C++, the empty string was silently ignored.
Starting with ICU 69 ([ICU-13702](https://unicode-org.atlassian.net/browse/ICU-13702)),
the empty string is supported as a set element;
however, it is ignored in matching functions such as `span(string)`.
## UnicodeSet Patterns

View file

@ -178,8 +178,6 @@ class RuleCharacterIterator;
* Unicode property
* </table>
*
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
*
* <p><b>Formal syntax</b></p>
*
* \htmlonly<blockquote>\endhtmlonly
@ -1104,8 +1102,8 @@ public:
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* A frozen set will not be modified.
*
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -1165,7 +1163,7 @@ public:
/**
* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the source string
* @return a newly created set containing the given string.
* The caller owns the return object and is responsible for deleting it.
@ -1279,8 +1277,8 @@ public:
* Complement the specified string in this set.
* The set will not contain the specified string once the call
* returns.
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* A frozen set will not be modified.
*
* @param s the string to complement
* @return this object, for chaining
* @stable ICU 2.4

View file

@ -444,7 +444,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
* @return <tt>true</tt> if this set contains the specified string
*/
UBool UnicodeSet::contains(const UnicodeString& s) const {
if (s.length() == 0) return FALSE;
int32_t cp = getSingleCP(s);
if (cp < 0) {
return stringsContains(s);
@ -559,11 +558,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
if (hasStrings()) {
for (i=0; i<strings->size(); ++i) {
const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
//if (s.length() == 0) {
// // Empty strings match everything
// return TRUE;
//}
// assert(s.length() != 0); // We enforce this elsewhere
if (s.isEmpty()) {
continue; // skip the empty string
}
UChar32 c = s.char32At(0);
if ((c & 0xFF) == v) {
return TRUE;
@ -582,9 +579,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
int32_t limit,
UBool incremental) {
if (offset == limit) {
// Strings, if any, have length != 0, so we don't worry
// about them here. If we ever allow zero-length strings
// we much check for them here.
if (contains(U_ETHER)) {
return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else {
@ -614,11 +608,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
for (i=0; i<strings->size(); ++i) {
const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
//if (trial.length() == 0) {
// return U_MATCH; // null-string always matches
//}
// assert(trial.length() != 0); // We ensure this elsewhere
if (trial.isEmpty()) {
continue; // skip the empty string
}
UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
@ -971,12 +963,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the source string
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (!stringsContains(s)) {
@ -991,8 +983,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
/**
* Adds the given string, in order, to 'strings'. The given string
* must have been checked by the caller to not be empty and to not
* already be in 'strings'.
* must have been checked by the caller to not already be in 'strings'.
*/
void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen() || isBogus()) {
@ -1021,16 +1012,13 @@ void UnicodeSet::_add(const UnicodeString& s) {
* @param string to test
*/
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
//if (s.length() < 1) {
// throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
//}
if (s.length() > 2) return -1;
if (s.length() == 1) return s.charAt(0);
// at this point, len = 2
UChar32 cp = s.char32At(0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
int32_t sLength = s.length();
if (sLength == 1) return s.charAt(0);
if (sLength == 2) {
UChar32 cp = s.char32At(0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
}
}
return -1;
}
@ -1186,7 +1174,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (strings != nullptr && strings->removeElement((void*) &s)) {
@ -1252,12 +1240,12 @@ UnicodeSet& UnicodeSet::complement(void) {
* Complement the specified string in this set.
* The set will not contain the specified string once the call
* returns.
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the string to complement
* @return this object, for chaining
*/
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (stringsContains(s)) {

View file

@ -555,7 +555,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
}
buf.append(c);
}
if (buf.length() < 1 || !ok) {
if (!ok) {
// syntaxError(chars, "Invalid multicharacter string");
ec = U_MALFORMED_SET;
return;

View file

@ -231,6 +231,9 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
if (length16==0) {
continue; // skip the empty string
}
UBool thisRelevant;
spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
if(spanLength<length16) { // Relevant string.
@ -312,7 +315,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
if(spanLength<length16) { // Relevant string.
if(spanLength<length16 && length16>0) { // Relevant string.
if(which&UTF16) {
if(which&CONTAINED) {
if(which&FWD) {
@ -362,7 +365,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
addToSpanNotSet(c);
}
}
} else { // Irrelevant string.
} else { // Irrelevant string. (Also the empty string.)
if(which&UTF8) {
if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
uint8_t *s8=utf8+utf8Count;
@ -653,11 +656,12 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi
for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanLengths[i];
if(overlap==ALL_CP_CONTAINED) {
continue; // Irrelevant string.
continue; // Irrelevant string. (Also the empty string.)
}
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
// Try to match this string at pos-overlap..pos.
if(overlap>=LONG_SPAN) {
@ -697,6 +701,9 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
if (length16==0) {
continue; // skip the empty string
}
// Try to match this string at pos-overlap..pos.
if(overlap>=LONG_SPAN) {
@ -817,11 +824,12 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC
for(i=0; i<stringsLength; ++i) {
int32_t overlap=spanBackLengths[i];
if(overlap==ALL_CP_CONTAINED) {
continue; // Irrelevant string.
continue; // Irrelevant string. (Also the empty string.)
}
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
// Try to match this string at pos-(length16-overlap)..pos-length16.
if(overlap>=LONG_SPAN) {
@ -863,6 +871,9 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
if (length16==0) {
continue; // skip the empty string
}
// Try to match this string at pos-(length16-overlap)..pos-length16.
if(overlap>=LONG_SPAN) {
@ -1358,11 +1369,12 @@ int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
// Try to match the strings at pos.
for(i=0; i<stringsLength; ++i) {
if(spanLengths[i]==ALL_CP_CONTAINED) {
continue; // Irrelevant string.
continue; // Irrelevant string. (Also the empty string.)
}
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
return pos; // There is a set element at pos.
}
@ -1401,11 +1413,12 @@ int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const
// it is easier and we only need to know whether the string is irrelevant
// which is the same in either array.
if(spanLengths[i]==ALL_CP_CONTAINED) {
continue; // Irrelevant string.
continue; // Irrelevant string. (Also the empty string.)
}
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
const UChar *s16=string.getBuffer();
int32_t length16=string.length();
U_ASSERT(length>0);
if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
return pos; // There is a set element at pos.
}

View file

@ -98,6 +98,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestIntOverflow);
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO(TestDeepPattern);
TESTCASE_AUTO(TestEmptyString);
TESTCASE_AUTO_END;
}
@ -3984,3 +3985,46 @@ void UnicodeSetTest::TestDeepPattern() {
assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
errorCode.reset();
}
void UnicodeSetTest::TestEmptyString() {
IcuTestErrorCode errorCode(*this, "TestEmptyString");
// Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
UnicodeSet set(u"[{}]", errorCode);
if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
assertTrue("set from pattern with {}", set.contains(u""));
assertEquals("set from pattern with {}: size", 1, set.size());
assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
// Remove, add back, ...
assertFalse("remove empty string", set.remove(u"").contains(u""));
assertEquals("remove empty string: size", 0, set.size());
assertTrue("remove empty string: isEmpty", set.isEmpty());
assertTrue("add empty string", set.add(u"").contains(u""));
// missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
assertTrue("complement-add empty string", set.complement(u"").contains(u""));
assertFalse("clear", set.clear().contains(u""));
assertTrue("add empty string 2", set.add(u"").contains(u""));
assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
assertTrue("add empty string 3", set.add(u"").contains(u""));
// Note that this leaves the set containing exactly the empty string.
// strings() access and iteration
// no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
UnicodeSetIterator sit(set);
assertTrue("set iterator.next()", sit.next());
assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
// The empty string is ignored in matching.
set.add(u'a').add(u'c');
assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
assertTrue("containsNone", set.containsNone(u"def"));
assertFalse("containsSome", set.containsSome(u"def"));
set.freeze();
assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
assertTrue("frozen containsNone", set.containsNone(u"def"));
assertFalse("frozen containsSome", set.containsSome(u"def"));
}

View file

@ -94,6 +94,7 @@ private:
void TestIntOverflow();
void TestUnusedCcc();
void TestDeepPattern();
void TestEmptyString();
private:

View file

@ -2,8 +2,6 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl;
import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;
import java.util.EnumMap;
import java.util.Map;
@ -95,7 +93,7 @@ public class StaticUnicodeSets {
* @return key1 if the set contains str, or null if not.
*/
public static Key chooseFrom(String str, Key key1) {
return safeContains(get(key1), str) ? key1 : null;
return get(key1).contains(str) ? key1 : null;
}
/**
@ -113,7 +111,7 @@ public class StaticUnicodeSets {
* contains str.
*/
public static Key chooseFrom(String str, Key key1, Key key2) {
return safeContains(get(key1), str) ? key1 : chooseFrom(str, key2);
return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
}
/**

View file

@ -110,9 +110,15 @@ public class UnicodeSetStringSpan {
int i, spanLength;
int maxLength16 = 0;
someRelevant = false;
for (i = 0; i < stringsLength; ++i) {
for (i = 0; i < stringsLength;) {
String string = strings.get(i);
int length16 = string.length();
if (length16 == 0) {
// Remove the empty string.
strings.remove(i);
--stringsLength;
continue;
}
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
if (spanLength < length16) { // Relevant string.
someRelevant = true;
@ -120,6 +126,7 @@ public class UnicodeSetStringSpan {
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
maxLength16 = length16;
}
++i;
}
this.maxLength16 = maxLength16;
if (!someRelevant && (which & WITH_COUNT) == 0) {

View file

@ -2,8 +2,6 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.number.parse;
import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;
import com.ibm.icu.impl.StaticUnicodeSets;
import com.ibm.icu.impl.StringSegment;
import com.ibm.icu.text.DecimalFormatSymbols;
@ -18,7 +16,7 @@ public class InfinityMatcher extends SymbolMatcher {
public static InfinityMatcher getInstance(DecimalFormatSymbols symbols) {
String symbolString = symbols.getInfinity();
if (safeContains(DEFAULT.uniSet, symbolString)) {
if (DEFAULT.uniSet.contains(symbolString)) {
return DEFAULT;
} else {
return new InfinityMatcher(symbolString);

View file

@ -2,8 +2,6 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.number.parse;
import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;
import com.ibm.icu.impl.StaticUnicodeSets;
import com.ibm.icu.impl.StringSegment;
import com.ibm.icu.text.DecimalFormatSymbols;
@ -19,7 +17,7 @@ public class MinusSignMatcher extends SymbolMatcher {
public static MinusSignMatcher getInstance(DecimalFormatSymbols symbols, boolean allowTrailing) {
String symbolString = symbols.getMinusSignString();
if (safeContains(DEFAULT.uniSet, symbolString)) {
if (DEFAULT.uniSet.contains(symbolString)) {
return allowTrailing ? DEFAULT_ALLOW_TRAILING : DEFAULT;
} else {
return new MinusSignMatcher(symbolString, allowTrailing);

View file

@ -33,7 +33,9 @@ public class ParsingUtils {
output.add(range.codepoint, range.codepointEnd);
}
for (String str : input.strings()) {
output.add(str.codePointAt(0));
if (!str.isEmpty()) {
output.add(str.codePointAt(0));
}
}
}
@ -42,10 +44,4 @@ public class ParsingUtils {
output.add(input.codePointAt(0));
}
}
// TODO: Remove this helper function (and update call sites) when #13805 is fixed
/**
 * Containment check that never passes a zero-length string to
 * {@link UnicodeSet#contains(CharSequence)}.
 *
 * @param uniset the set to query
 * @param str the candidate string
 * @return false when str has length 0; otherwise the result of uniset.contains(str)
 */
public static boolean safeContains(UnicodeSet uniset, CharSequence str) {
    // Guard first so that contains() is only consulted for non-empty input.
    if (str.length() == 0) {
        return false;
    }
    return uniset.contains(str);
}
}

View file

@ -2,8 +2,6 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.number.parse;
import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;
import com.ibm.icu.impl.StaticUnicodeSets;
import com.ibm.icu.impl.StringSegment;
import com.ibm.icu.text.DecimalFormatSymbols;
@ -19,7 +17,7 @@ public class PlusSignMatcher extends SymbolMatcher {
public static PlusSignMatcher getInstance(DecimalFormatSymbols symbols, boolean allowTrailing) {
String symbolString = symbols.getPlusSignString();
if (safeContains(DEFAULT.uniSet, symbolString)) {
if (DEFAULT.uniSet.contains(symbolString)) {
return allowTrailing ? DEFAULT_ALLOW_TRAILING : DEFAULT;
} else {
return new PlusSignMatcher(symbolString, allowTrailing);

View file

@ -2,8 +2,6 @@
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.number.parse;
import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;
import com.ibm.icu.impl.StaticUnicodeSets;
import com.ibm.icu.impl.StringSegment;
import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
@ -36,9 +34,9 @@ public class ScientificMatcher implements NumberParseMatcher {
ignorablesMatcher = IgnorablesMatcher.getInstance(ParsingUtils.PARSE_FLAG_STRICT_IGNORABLES);
String minusSign = symbols.getMinusSignString();
customMinusSign = safeContains(minusSignSet(), minusSign) ? null : minusSign;
customMinusSign = minusSignSet().contains(minusSign) ? null : minusSign;
String plusSign = symbols.getPlusSignString();
customPlusSign = safeContains(plusSignSet(), plusSign) ? null : plusSign;
customPlusSign = plusSignSet().contains(plusSign) ? null : plusSign;
}
private static UnicodeSet minusSignSet() {

View file

@ -186,8 +186,6 @@ import com.ibm.icu.util.VersionInfo;
* Unicode property
* </table>
*
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
*
* <p><b>Formal syntax</b></p>
*
* <blockquote>
@ -892,11 +890,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
if (hasStrings()) {
for (String s : strings) {
//if (s.length() == 0) {
// // Empty strings match everything
// return true;
//}
// assert(s.length() != 0); // We enforce this elsewhere
if (s.isEmpty()) {
continue; // skip the empty string
}
int c = UTF16.charAt(s, 0);
if ((c & 0xFF) == v) {
return true;
@ -918,9 +914,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
boolean incremental) {
if (offset[0] == limit) {
// Strings, if any, have length != 0, so we don't worry
// about them here. If we ever allow zero-length strings
// we much check for them here.
if (contains(UnicodeMatcher.ETHER)) {
return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else {
@ -948,10 +941,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
int highWaterLength = 0;
for (String trial : strings) {
//if (trial.length() == 0) {
// return U_MATCH; // null-string always matches
//}
// assert(trial.length() != 0); // We ensure this elsewhere
if (trial.isEmpty()) {
continue; // skip the empty string
}
char c = trial.charAt(forward ? 0 : trial.length() - 1);
@ -1363,7 +1355,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus "ch" =&gt; {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.0
@ -1392,22 +1384,19 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
/**
* Utility for getting code point from single code point CharSequence.
* See the public UTF16.getSingleCodePoint()
* See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
*
* @return a code point IF the string consists of a single one.
* otherwise returns -1.
* @param s to test
*/
private static int getSingleCP(CharSequence s) {
if (s.length() < 1) {
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
}
if (s.length() > 2) return -1;
if (s.length() == 1) return s.charAt(0);
// at this point, len = 2
int cp = UTF16.charAt(s, 0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
if (s.length() == 2) {
int cp = Character.codePointAt(s, 0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
}
}
return -1;
}
@ -1478,7 +1467,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
/**
* Makes a set from a multicharacter string. Thus "ch" =&gt; {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the source string
* @return a newly created set containing the given string
* @stable ICU 2.0
@ -1686,7 +1675,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Complement the specified string in this set.
* The set will not contain the specified string once the call
* returns.
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
*
* @param s the string to complement
* @return this object, for chaining
* @stable ICU 2.0
@ -2056,7 +2045,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
return true;
}
for (String setStr : strings) {
if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) {
if (!setStr.isEmpty() && // skip the empty string
s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) {
return true;
}
}
@ -2801,7 +2791,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
appendCodePoint(buf, c);
}
if (buf.length() < 1 || !ok) {
if (!ok) {
syntaxError(chars, "Invalid multicharacter string");
}
// We have new string. Add it to set and continue;

View file

@ -1528,12 +1528,12 @@ public class UnicodeSetTest extends TestFmwk {
//public Iterator<String> iterator() {
ArrayList<String> oldList = new ArrayList<String>();
ArrayList<String> oldList = new ArrayList<>();
for (UnicodeSetIterator it = new UnicodeSetIterator(set1); it.next();) {
oldList.add(it.getString());
}
ArrayList<String> list1 = new ArrayList<String>();
ArrayList<String> list1 = new ArrayList<>();
for (String s : set1) {
list1.add(s);
}
@ -1613,11 +1613,11 @@ public class UnicodeSetTest extends TestFmwk {
List<UnicodeSet> goalLongest = Arrays.asList(set1, set3, set2);
List<UnicodeSet> goalLex = Arrays.asList(set1, set2, set3);
List<UnicodeSet> sorted = new ArrayList(new TreeSet<UnicodeSet>(unsorted));
List<UnicodeSet> sorted = new ArrayList(new TreeSet<>(unsorted));
assertNotEquals("compareTo-shorter-first", unsorted, sorted);
assertEquals("compareTo-shorter-first", goalShortest, sorted);
TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
TreeSet<UnicodeSet> sorted1 = new TreeSet<>(new Comparator<UnicodeSet>(){
@Override
public int compare(UnicodeSet o1, UnicodeSet o2) {
// TODO Auto-generated method stub
@ -1628,7 +1628,7 @@ public class UnicodeSetTest extends TestFmwk {
assertNotEquals("compareTo-longer-first", unsorted, sorted);
assertEquals("compareTo-longer-first", goalLongest, sorted);
sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
sorted1 = new TreeSet<>(new Comparator<UnicodeSet>(){
@Override
public int compare(UnicodeSet o1, UnicodeSet o2) {
// TODO Auto-generated method stub
@ -1642,7 +1642,7 @@ public class UnicodeSetTest extends TestFmwk {
//compare(String, int)
// make a list of interesting combinations
List<String> sources = Arrays.asList("\u0000", "a", "b", "\uD7FF", "\uD800", "\uDBFF", "\uDC00", "\uDFFF", "\uE000", "\uFFFD", "\uFFFF");
TreeSet<String> target = new TreeSet<String>();
TreeSet<String> target = new TreeSet<>();
for (String s : sources) {
target.add(s);
for (String t : sources) {
@ -1685,8 +1685,8 @@ public class UnicodeSetTest extends TestFmwk {
//compare(Iterable<T>, Iterable<T>)
int max = 10;
List<String> test1 = new ArrayList<String>(max);
List<String> test2 = new ArrayList<String>(max);
List<String> test1 = new ArrayList<>(max);
List<String> test2 = new ArrayList<>(max);
for (int i = 0; i <= max; ++i) {
test1.add("a" + i);
test2.add("a" + (max - i)); // add in reverse order
@ -2792,4 +2792,47 @@ public class UnicodeSetTest extends TestFmwk {
} catch(RuntimeException expected) {
}
}
/**
 * Exercises a UnicodeSet that holds the empty string (allowed starting with
 * ICU 69, ICU-13702): construction from a pattern, the mutating operations,
 * strings() access and iteration, and the matching functions, which ignore
 * the empty string.
 */
@Test
public void TestEmptyString() {
    // The pattern "[{}]" yields a set whose only element is the empty string.
    UnicodeSet uset = new UnicodeSet("[{}]");
    assertTrue("set from pattern with {}", uset.contains(""));
    assertEquals("set from pattern with {}: size", 1, uset.size());
    assertFalse("set from pattern with {}: isEmpty", uset.isEmpty());

    // Take the empty string out again, then put it back through each mutator.
    assertFalse("remove empty string", uset.remove("").contains(""));
    assertEquals("remove empty string: size", 0, uset.size());
    assertTrue("remove empty string: isEmpty", uset.isEmpty());
    assertTrue("add empty string", uset.add("").contains(""));
    assertTrue("retain empty string", uset.retain("").contains(""));
    assertFalse("complement-remove empty string", uset.complement("").contains(""));
    assertTrue("complement-add empty string", uset.complement("").contains(""));
    assertFalse("clear", uset.clear().contains(""));
    assertTrue("add empty string 2", uset.add("").contains(""));
    assertFalse("removeAllStrings", uset.removeAllStrings().contains(""));
    assertTrue("add empty string 3", uset.add("").contains(""));
    // At this point the set holds exactly one element: the empty string.

    // Both strings() and the iterator must surface the empty string.
    assertTrue("strings()", uset.strings().contains(""));
    UnicodeSetIterator iter = new UnicodeSetIterator(uset);
    assertTrue("set iterator.next()", iter.next());
    boolean firstIsEmptyString =
            iter.codepoint == UnicodeSetIterator.IS_STRING && iter.getString().isEmpty();
    assertTrue("set iterator has empty string", firstIsEmptyString);

    // Matching ignores the empty string: with 'a' and 'c' added, only those
    // code points take part in span()/spanBack()/containsNone()/containsSome().
    uset.add('a');
    uset.add('c');
    assertEquals("span", 1, uset.span("abc", SpanCondition.SIMPLE));
    assertEquals("spanBack", 2, uset.spanBack("abc", SpanCondition.SIMPLE));
    assertTrue("containsNone", uset.containsNone("def"));
    assertFalse("containsSome", uset.containsSome("def"));

    // Repeat the same matching checks on the frozen set.
    uset.freeze();
    assertEquals("frozen span", 1, uset.span("abc", SpanCondition.SIMPLE));
    assertEquals("frozen spanBack", 2, uset.spanBack("abc", SpanCondition.SIMPLE));
    assertTrue("frozen containsNone", uset.containsNone("def"));
    assertFalse("frozen containsSome", uset.containsSome("def"));
}
}

View file

@ -562,12 +562,12 @@ public final class StringTokenizerTest extends TestFmwk
us._generatePattern(sb.append(1.0), true);
us._generatePattern(sb.reverse(), true);
} catch(Exception e){
errln("UnicodeSet._generatePattern is not suppose to return an exception.");
errln("UnicodeSet._generatePattern is not supposed to return an exception.");
}
try{
us._generatePattern(null, true);
errln("UnicodeSet._generatePattern is suppose to return an exception.");
errln("UnicodeSet._generatePattern is supposed to return an exception.");
} catch(Exception e){}
}
@ -585,12 +585,12 @@ public final class StringTokenizerTest extends TestFmwk
int limit = 0;
if(us.matches(null, offset, limit, true) != UnicodeSet.U_PARTIAL_MATCH){
errln("UnicodeSet.matches is suppose to return " + UnicodeSet.U_PARTIAL_MATCH +
errln("UnicodeSet.matches is supposed to return " + UnicodeSet.U_PARTIAL_MATCH +
" but got " + us.matches(null, offset, limit, true));
}
if(us.matches(null, offset, limit, false) != UnicodeSet.U_MATCH){
errln("UnicodeSet.matches is suppose to return " + UnicodeSet.U_MATCH +
errln("UnicodeSet.matches is supposed to return " + UnicodeSet.U_MATCH +
" but got " + us.matches(null, offset, limit, false));
}
@ -601,7 +601,7 @@ public final class StringTokenizerTest extends TestFmwk
offset[0] = 4; // Takes the letter "y"
us.matches(rs, offset, 1, true);
} catch(Exception e) {
errln("UnicodeSet.matches is not suppose to return an exception");
errln("UnicodeSet.matches is not supposed to return an exception");
}
// TODO: Tests when "if (forward && length < highWaterLength)" is true
@ -650,7 +650,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.indexOf(invalid[i]);
errln("UnicodeSet.indexOf is suppose to return an exception " +
errln("UnicodeSet.indexOf is supposed to return an exception " +
"for a value of " + invalid[i]);
} catch(Exception e){}
}
@ -659,7 +659,7 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.indexOf(valid[i]);
} catch(Exception e){
errln("UnicodeSet.indexOf is not suppose to return an exception " +
errln("UnicodeSet.indexOf is not supposed to return an exception " +
"for a value of " + valid[i]);
}
}
@ -676,7 +676,7 @@ public final class StringTokenizerTest extends TestFmwk
int[] invalid = {-100,-10,-5,-2,-1};
for(int i=0; i < invalid.length; i++){
if(us.charAt(invalid[i]) != -1){
errln("UnicodeSet.charAt(int index) was suppose to return -1 "
errln("UnicodeSet.charAt(int index) was supposed to return -1 "
+ "for an invalid input of " + invalid[i]);
}
}
@ -696,7 +696,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.add(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.add(int start, int end) was suppose to give "
errln("UnicodeSet.add(int start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -706,7 +706,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.add(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.add(int start, int end) was suppose to give "
errln("UnicodeSet.add(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -714,12 +714,12 @@ public final class StringTokenizerTest extends TestFmwk
// Tests when "else if (start == end)" is false
if(!(us.add(UnicodeSet.MIN_VALUE+1, UnicodeSet.MIN_VALUE).equals(us)))
errln("UnicodeSet.add(int start, int end) was suppose to return "
errln("UnicodeSet.add(int start, int end) was supposed to return "
+ "the same object because start of value " + (UnicodeSet.MIN_VALUE+1)
+ " is greater than end of value " + UnicodeSet.MIN_VALUE);
if(!(us.add(UnicodeSet.MAX_VALUE, UnicodeSet.MAX_VALUE-1).equals(us)))
errln("UnicodeSet.add(int start, int end) was suppose to return "
errln("UnicodeSet.add(int start, int end) was supposed to return "
+ "the same object because start of value " + UnicodeSet.MAX_VALUE
+ " is greater than end of value " + (UnicodeSet.MAX_VALUE-1));
}
@ -738,7 +738,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.add(invalid[i]);
errln("UnicodeSet.add(int c) was suppose to give "
errln("UnicodeSet.add(int c) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -758,14 +758,15 @@ public final class StringTokenizerTest extends TestFmwk
// Tests when "if (s.length() < 1)" is true
try{
us.contains("");
errln("UnicodeSet.getSingleCP is suppose to give an exception for " +
} catch (Exception e) {
errln("UnicodeSet.getSingleCP is not supposed to give an exception for " +
"an empty string.");
} catch (Exception e){}
}
try{
us.contains((String)null);
errln("UnicodeSet.getSingleCP is suppose to give an exception for " +
"a null string.");
errln("UnicodeSet.getSingleCP is supposed to give an exception for " +
"a null string.");
} catch (Exception e){}
// Tests when "if (cp > 0xFFFF)" is true
@ -774,8 +775,8 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.contains(cases[i]);
} catch (Exception e){
errln("UnicodeSet.getSingleCP is not suppose to give an exception for " +
"a null string.");
errln("UnicodeSet.getSingleCP is not supposed to give an exception for " +
"a surrogate pair.");
}
}
}
@ -790,7 +791,7 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.removeAllStrings();
} catch(Exception e){
errln("UnicodeSet.removeAllString() was not suppose to given an " +
errln("UnicodeSet.removeAllString() was not supposed to given an " +
"exception for a strings size of 0");
}
}
@ -808,7 +809,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.retain(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.retain(int start, int end) was suppose to give "
errln("UnicodeSet.retain(int start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -818,7 +819,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.retain(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.retain(int start, int end) was suppose to give "
errln("UnicodeSet.retain(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -828,14 +829,14 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.retain(UnicodeSet.MIN_VALUE+1, UnicodeSet.MIN_VALUE);
} catch(Exception e){
errln("UnicodeSet.retain(int start, int end) was not suppose to give "
errln("UnicodeSet.retain(int start, int end) was not supposed to give "
+ "an exception.");
}
try{
us.retain(UnicodeSet.MAX_VALUE, UnicodeSet.MAX_VALUE-1);
} catch(Exception e){
errln("UnicodeSet.retain(int start, int end) was not suppose to give "
errln("UnicodeSet.retain(int start, int end) was not supposed to give "
+ "an exception.");
}
}
@ -849,7 +850,7 @@ public final class StringTokenizerTest extends TestFmwk
UnicodeSet us = new UnicodeSet();
us.add("dummy");
if(!(us.retain("dummy").equals(us))){
errln("UnicodeSet.retain(String s) was suppose to return the " +
errln("UnicodeSet.retain(String s) was supposed to return the " +
"same UnicodeSet since the string was found in the original.");
}
}
@ -867,7 +868,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.remove(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.remove(int start, int end) was suppose to give "
errln("UnicodeSet.remove(int start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -877,7 +878,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.remove(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.remove(int start, int end) was suppose to give "
errln("UnicodeSet.remove(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -887,14 +888,14 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.remove(UnicodeSet.MIN_VALUE+1, UnicodeSet.MIN_VALUE);
} catch(Exception e){
errln("UnicodeSet.remove(int start, int end) was not suppose to give "
errln("UnicodeSet.remove(int start, int end) was not supposed to give "
+ "an exception.");
}
try{
us.remove(UnicodeSet.MAX_VALUE, UnicodeSet.MAX_VALUE-1);
} catch(Exception e){
errln("UnicodeSet.remove(int start, int end) was not suppose to give "
errln("UnicodeSet.remove(int start, int end) was not supposed to give "
+ "an exception.");
}
}
@ -912,7 +913,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.complement(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.complement(int start, int end) was suppose to give "
errln("UnicodeSet.complement(int start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -922,7 +923,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.complement(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.complement(int start, int end) was suppose to give "
errln("UnicodeSet.complement(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -932,14 +933,14 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.complement(UnicodeSet.MIN_VALUE+1, UnicodeSet.MIN_VALUE);
} catch(Exception e){
errln("UnicodeSet.complement(int start, int end) was not suppose to give "
errln("UnicodeSet.complement(int start, int end) was not supposed to give "
+ "an exception.");
}
try{
us.complement(UnicodeSet.MAX_VALUE, UnicodeSet.MAX_VALUE-1);
} catch(Exception e){
errln("UnicodeSet.complement(int start, int end) was not suppose to give "
errln("UnicodeSet.complement(int start, int end) was not supposed to give "
+ "an exception.");
}
}
@ -955,7 +956,7 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.complement("dummy");
} catch (Exception e){
errln("UnicodeSet.complement(String s) was not suppose to give "
errln("UnicodeSet.complement(String s) was not supposed to give "
+ "an exception for 'dummy'.");
}
@ -965,7 +966,7 @@ public final class StringTokenizerTest extends TestFmwk
try{
us.complement("\uDC11");
} catch (Exception e){
errln("UnicodeSet.complement(String s) was not suppose to give "
errln("UnicodeSet.complement(String s) was not supposed to give "
+ "an exception for '\uDC11'.");
}
}
@ -983,7 +984,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.contains(invalid[i]);
errln("UnicodeSet.contains(int c) was suppose to give "
errln("UnicodeSet.contains(int c) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -1003,7 +1004,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.contains(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.contains(int start, int end) was suppose to give "
errln("UnicodeSet.contains(int start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -1013,7 +1014,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.contains(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.contains(int start, int end) was suppose to give "
errln("UnicodeSet.contains(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -1028,7 +1029,7 @@ public final class StringTokenizerTest extends TestFmwk
UnicodeSet us = new UnicodeSet();
String res = us.getRegexEquivalent();
if(!(res.equals("[]")))
errln("UnicodeSet.getRegexEquivalent is suppose to return '[]' " +
errln("UnicodeSet.getRegexEquivalent is supposed to return '[]' " +
"but got " + res);
}
@ -1045,7 +1046,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.containsNone(invalid[i], UnicodeSet.MAX_VALUE);
errln("UnicodeSet.containsNoneint start, int end) was suppose to give "
errln("UnicodeSet.containsNoneint start, int end) was supposed to give "
+ "an exception for an start invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -1055,7 +1056,7 @@ public final class StringTokenizerTest extends TestFmwk
for(int i=0; i < invalid.length; i++){
try{
us.containsNone(UnicodeSet.MIN_VALUE, invalid[i]);
errln("UnicodeSet.containsNone(int start, int end) was suppose to give "
errln("UnicodeSet.containsNone(int start, int end) was supposed to give "
+ "an exception for an end invalid input of "
+ invalid[i]);
} catch (Exception e){}
@ -1066,7 +1067,7 @@ public final class StringTokenizerTest extends TestFmwk
us.add(0);
us.containsNone(1, 2); // 1 > 0
} catch (Exception e){
errln("UnicodeSet.containsNone(int start, int end) was not suppose to give " +
errln("UnicodeSet.containsNone(int start, int end) was not supposed to give " +
"an exception.");
}
}