ICU-21524 UnicodeSet pattern parser: code point complement

This commit is contained in:
Markus Scherer 2021-09-14 13:04:27 -07:00
parent 7bc2009f7f
commit f026e967f6
8 changed files with 186 additions and 20 deletions

View file

@ -136,6 +136,13 @@ class RuleCharacterIterator;
* their delimiters; "[:^foo]" and "\\P{foo}". In any other location,
* '^' has no special meaning.
*
* <p>Since ICU 70, "[^...]", "[:^foo]", "\\P{foo}", and "[:binaryProperty=No:]"
* perform a code point complement (all code points minus the original set),
* removing all multicharacter strings,
* equivalent to <code>.complement().removeAllStrings()</code>.
* The complement() API function continues to perform a
* symmetric difference with all code points and thus retains all multicharacter strings.
*
* <p>Ranges are indicated by placing a '-' between two
* characters, as in "a-z". This specifies the range of all
* characters from the left to the right, in Unicode order. If the
@ -1275,13 +1282,18 @@ public:
UnicodeSet& remove(const UnicodeString& s);
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This is equivalent to
* This is equivalent to
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
*
* <strong>Note:</strong> This performs a symmetric difference with all code points
* <em>and thus retains all multicharacter strings</em>.
* In order to achieve a code point complement (all code points minus this set),
* the easiest is to <code>.complement().removeAllStrings()</code>.
*
* A frozen set will not be modified.
* @stable ICU 2.0
*/
virtual UnicodeSet& complement(void);
virtual UnicodeSet& complement();
/**
* Complements the specified range in this set. Any character in

View file

@ -726,9 +726,14 @@ U_CAPI void U_EXPORT2
uset_compact(USet* set);
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This operation does not affect
* the multicharacter strings, if any.
* This is equivalent to
* <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
*
* <strong>Note:</strong> This performs a symmetric difference with all code points
* <em>and thus retains all multicharacter strings</em>.
* In order to achieve a code point complement (all code points minus this set),
* the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>.
*
* A frozen set will not be modified.
* @param set the set
* @stable ICU 2.4

View file

@ -638,7 +638,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
}
if (invert) {
complement();
complement().removeAllStrings(); // code point complement
}
// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
@ -791,7 +791,7 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
if (U_FAILURE(ec)) { return *this; }
copyFrom(*UnicodeSet::fromUSet(set), TRUE);
if (value == 0) {
complement();
complement().removeAllStrings(); // code point complement
}
} else {
clear();
@ -958,7 +958,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
applyIntPropertyValue(p, v, ec);
if(invert) {
complement();
complement().removeAllStrings(); // code point complement
}
if (isBogus() && U_SUCCESS(ec)) {
@ -1101,9 +1101,9 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
if (U_SUCCESS(ec)) {
if (invert) {
complement();
complement().removeAllStrings(); // code point complement
}
// Move to the limit position after the close delimiter if the
// parse succeeded.
ppos.setIndex(close + (posix ? 2 : 1));

View file

@ -4580,6 +4580,13 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
if (U_SUCCESS(status)) {
// ICU 70 adds emoji properties of strings, but as long as Java does not say how to
// deal with properties of strings and character classes with strings, we ignore them.
// Just in case something downstream might stumble over the strings,
// we remove them from the set.
// Note that when we support strings, the complement of a property (as with \P)
// should be implemented as .complement().removeAllStrings() (code point complement).
set->removeAllStrings();
U_ASSERT(set.isValid());
if (negated) {
set->complement();
@ -4613,6 +4620,13 @@ void RegexCompile::setEval(int32_t nextOp) {
fSetOpStack.popi();
U_ASSERT(fSetStack.empty() == FALSE);
rightOperand = (UnicodeSet *)fSetStack.peek();
// ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
// (see comments there).
// We also do not yet support string literals in character classes,
// so there should not be any strings.
// Note that when we support strings, the complement of a set (as with ^ or \P)
// should be implemented as .complement().removeAllStrings() (code point complement).
U_ASSERT(!rightOperand->hasStrings());
switch (pendingSetOperation) {
case setNegation:
rightOperand->complement();

View file

@ -100,6 +100,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestDeepPattern);
TESTCASE_AUTO(TestEmptyString);
TESTCASE_AUTO(TestSkipToStrings);
TESTCASE_AUTO(TestPatternCodePointComplement);
TESTCASE_AUTO_END;
}
@ -4095,3 +4096,65 @@ void UnicodeSetTest::TestSkipToStrings() {
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
}
void UnicodeSetTest::TestPatternCodePointComplement() {
IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
// ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
// [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
{
UnicodeSet simple(u"[^abc{ch}]", errorCode);
assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
}
{
UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
return;
}
assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U'🚲'));
}
{
UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U'🚲'));
}
{
UnicodeSet notBasic;
notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
notBasic.size() > 1000);
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
notBasic.hasStrings());
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
notBasic.contains(U'🚲'));
}
{
UnicodeSet notBasic;
notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
notBasic.size() > 1000);
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
notBasic.hasStrings());
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
notBasic.contains(U'🚲'));
}
// The complement() API behavior does not change under this ticket.
{
UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
notBasic.complement();
assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
notBasic.contains(u"🐿\uFE0F"));
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'🚲'));
}
}

View file

@ -99,6 +99,7 @@ private:
void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
void TestSkipToStrings();
void TestPatternCodePointComplement();
private:

View file

@ -144,6 +144,13 @@ import com.ibm.icu.util.VersionInfo;
* their delimiters; "[:^foo]" and "\P{foo}". In any other location,
* '^' has no special meaning.
*
* <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
* perform a code point complement (all code points minus the original set),
* removing all multicharacter strings,
* equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
* The {@link #complement()} API function continues to perform a
* symmetric difference with all code points and thus retains all multicharacter strings.
*
* <p>Ranges are indicated by placing a '-' between two
* characters, as in "a-z". This specifies the range of all
* characters from the left to the right, in Unicode order. If the
@ -1689,6 +1696,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
/**
* This is equivalent to
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
*
* <p><strong>Note:</strong> This performs a symmetric difference with all code points
* <em>and thus retains all multicharacter strings</em>.
* In order to achieve a code point complement (all code points minus this set),
* the easiest is to .{@link #complement()}.{@link #removeAllStrings()} .
*
* @stable ICU 2.0
*/
public UnicodeSet complement() {
@ -2953,7 +2966,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
closeOver(CASE);
}
if (invert) {
complement();
complement().removeAllStrings(); // code point complement
}
// Use the rebuilt pattern (pat) only if necessary. Prefer the
@ -3474,7 +3487,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
if (value == 0 || value == 1) {
set(CharacterProperties.getBinaryPropertySet(prop));
if (value == 0) {
complement();
complement().removeAllStrings(); // code point complement
}
} else {
clear();
@ -3670,7 +3683,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
applyIntPropertyValue(p, v);
if(invert) {
complement();
complement().removeAllStrings(); // code point complement
}
return this;
@ -3798,7 +3811,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
applyPropertyAlias(propName, valueName, symbols);
if (invert) {
complement();
complement().removeAllStrings(); // code point complement
}
// Move to the limit position after the close delimiter
@ -4768,9 +4781,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*/
@Deprecated
public UnicodeSet addBridges(UnicodeSet dontCare) {
UnicodeSet notInInput = new UnicodeSet(this).complement();
UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings();
for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF &&
dontCare.contains(it.codepoint, it.codepointEnd)) {
add(it.codepoint,it.codepointEnd);
}
}

View file

@ -153,12 +153,11 @@ public class UnicodeSetTest extends TestFmwk {
UnicodeSet collectedErrors = new UnicodeSet();
for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
if (it.codepoint == UnicodeSetIterator.IS_STRING) {
// For binary properties of strings, only [:propName=true:] _should_ yield strings.
// For binary properties of strings, only [:propName=true:] yields strings.
// Therefore, we should always have valueNum=1 and b=true.
// TODO: ICU-21524 ^ and propName=N use complement() which leaves strings alone.
boolean b = UCharacter.hasBinaryProperty(it.string, propNum);
int value = b ? 1 : 0;
if (value != valueNum && /* TODO: ICU-21524 */ valueNum != 0) {
if (value != valueNum) {
collectedErrors.add(it.string);
}
} else {
@ -2924,4 +2923,62 @@ public class UnicodeSetTest extends TestFmwk {
assertNext(iter, "ch");
assertFalse("no next", iter.next());
}
@Test
public void TestPatternCodePointComplement() {
    // Regression test for ICU-21524: pattern '^' and the equivalent property
    // negations perform a "code point complement", i.e.
    // [^abc{ch}] = [[:Any:]-[abc{ch}]], which leaves no strings in the result.
    final String bicycle = "🚲";

    // Negated bracket expression.
    {
        UnicodeSet negated = new UnicodeSet("[^abc{ch}]");
        final int expectedSize = 0x110000 - 3;
        assertEquals("[^abc{ch}] --> lots of elements", expectedSize, negated.size());
        assertFalse("[^abc{ch}] --> no strings", negated.hasStrings());
        assertFalse("[^abc{ch}] --> no 'a'", negated.contains('a'));
    }

    // POSIX-style negated property.
    {
        UnicodeSet negatedPosix = new UnicodeSet("[:^Basic_Emoji:]");
        assertTrue("[:^Basic_Emoji:] --> lots of elements", negatedPosix.size() > 1000);
        assertFalse("[:^Basic_Emoji:] --> no strings", negatedPosix.hasStrings());
        assertFalse("[:^Basic_Emoji:] --> no bicycle", negatedPosix.contains(bicycle));
    }

    // Binary property with an explicit "No" value in a pattern.
    {
        UnicodeSet valueNo = new UnicodeSet("[:Basic_Emoji=No:]");
        assertTrue("[:Basic_Emoji=No:] --> lots of elements", valueNo.size() > 1000);
        assertFalse("[:Basic_Emoji=No:] --> no strings", valueNo.hasStrings());
        assertFalse("[:Basic_Emoji=No:] --> no bicycle", valueNo.contains(bicycle));
    }

    // Same request through applyIntPropertyValue() with value 0.
    {
        UnicodeSet viaIntValue = new UnicodeSet();
        viaIntValue.applyIntPropertyValue(UProperty.BASIC_EMOJI, 0);
        assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
                viaIntValue.size() > 1000);
        assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
                viaIntValue.hasStrings());
        assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
                viaIntValue.contains(bicycle));
    }

    // Same request through applyPropertyAlias() with value name "No".
    {
        UnicodeSet viaAlias = new UnicodeSet();
        viaAlias.applyPropertyAlias("Basic_Emoji", "No");
        assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
                viaAlias.size() > 1000);
        assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
                viaAlias.hasStrings());
        assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
                viaAlias.contains(bicycle));
    }

    // The complement() API itself is untouched by this ticket:
    // the strings of the original set are expected to remain.
    {
        UnicodeSet apiComplement = new UnicodeSet("[:Basic_Emoji:]");
        apiComplement.complement();
        assertTrue("[:Basic_Emoji:].complement() --> lots of elements",
                apiComplement.size() > 1000);
        assertTrue("[:Basic_Emoji:].complement() --> has strings",
                apiComplement.hasStrings());
        assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
                apiComplement.contains("🐿\uFE0F"));
        assertFalse("[:Basic_Emoji:].complement() --> no bicycle",
                apiComplement.contains(bicycle));
    }
}
}