ICU-21524 UnicodeSet pattern parser: code point complement

2025-04-14 17:24:01 +00:00 · 2021-09-14 13:04:27 -07:00 · 2021-09-14 13:04:27 -07:00 · f026e967f6
commit f026e967f6
parent 7bc2009f7f
8 changed files with 186 additions and 20 deletions
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -136,6 +136,13 @@ class RuleCharacterIterator;
 * their delimiters; "[:^foo]" and "\\P{foo}".  In any other location,
 * '^' has no special meaning.
 *
+ * <p>Since ICU 70, "[^...]", "[:^foo]", "\\P{foo}", and "[:binaryProperty=No:]"
+ * perform a “code point complement” (all code points minus the original set),
+ * removing all multicharacter strings,
+ * equivalent to <code>.complement().removeAllStrings()</code>.
+ * The complement() API function continues to perform a
+ * symmetric difference with all code points and thus retains all multicharacter strings.
+ *
 * <p>Ranges are indicated by placing two a '-' between two
 * characters, as in "a-z".  This specifies the range of all
 * characters from the left to the right, in Unicode order.  If the
@ -1275,13 +1282,18 @@ public:
    UnicodeSet& remove(const UnicodeString& s);

    /**
-     * Inverts this set.  This operation modifies this set so that
-     * its value is its complement.  This is equivalent to
+     * This is equivalent to
     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
+     *
+     * <strong>Note:</strong> This performs a symmetric difference with all code points
+     * <em>and thus retains all multicharacter strings</em>.
+     * In order to achieve a “code point complement” (all code points minus this set),
+     * the easiest way is to call <code>.complement().removeAllStrings()</code>.
+     *
     * A frozen set will not be modified.
     * @stable ICU 2.0
     */
-    virtual UnicodeSet& complement(void);
+    virtual UnicodeSet& complement();

    /**
     * Complements the specified range in this set.  Any character in
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@ -726,9 +726,14 @@ U_CAPI void U_EXPORT2
 uset_compact(USet* set);

 /**
- * Inverts this set.  This operation modifies this set so that
- * its value is its complement.  This operation does not affect
- * the multicharacter strings, if any.
+ * This is equivalent to
+ * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
+ *
+ * <strong>Note:</strong> This performs a symmetric difference with all code points
+ * <em>and thus retains all multicharacter strings</em>.
+ * In order to achieve a “code point complement” (all code points minus this set),
+ * the easiest way is to call <code>uset_complement(set); uset_removeAllStrings(set);</code>.
+ *
 * A frozen set will not be modified.
 * @param set the set
 * @stable ICU 2.4
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@ -638,7 +638,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
-        complement();
+        complement().removeAllStrings();  // code point complement
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
@ -791,7 +791,7 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
            if (U_FAILURE(ec)) { return *this; }
            copyFrom(*UnicodeSet::fromUSet(set), TRUE);
            if (value == 0) {
-                complement();
+                complement().removeAllStrings();  // code point complement
            }
        } else {
            clear();
@ -958,7 +958,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,

    applyIntPropertyValue(p, v, ec);
    if(invert) {
-        complement();
+        complement().removeAllStrings();  // code point complement
    }

    if (isBogus() && U_SUCCESS(ec)) {
@ -1101,9 +1101,9 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,

    if (U_SUCCESS(ec)) {
        if (invert) {
-            complement();
+            complement().removeAllStrings();  // code point complement
        }
-            
+
        // Move to the limit position after the close delimiter if the
        // parse succeeded.
        ppos.setIndex(close + (posix ? 2 : 1));
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -4580,6 +4580,13 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.

    if (U_SUCCESS(status)) {
+        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
+        // deal with properties of strings and character classes with strings, we ignore them.
+        // Just in case something downstream might stumble over the strings,
+        // we remove them from the set.
+        // Note that when we support strings, the complement of a property (as with \P)
+        // should be implemented as .complement().removeAllStrings() (code point complement).
+        set->removeAllStrings();
        U_ASSERT(set.isValid());
        if (negated) {
            set->complement();
@ -4613,6 +4620,13 @@ void RegexCompile::setEval(int32_t nextOp) {
        fSetOpStack.popi();
        U_ASSERT(fSetStack.empty() == FALSE);
        rightOperand = (UnicodeSet *)fSetStack.peek();
+        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
+        // (see comments there).
+        // We also do not yet support string literals in character classes,
+        // so there should not be any strings.
+        // Note that when we support strings, the complement of a set (as with ^ or \P)
+        // should be implemented as .complement().removeAllStrings() (code point complement).
+        U_ASSERT(!rightOperand->hasStrings());
        switch (pendingSetOperation) {
            case setNegation:
                rightOperand->complement();
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -100,6 +100,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
    TESTCASE_AUTO(TestDeepPattern);
    TESTCASE_AUTO(TestEmptyString);
    TESTCASE_AUTO(TestSkipToStrings);
+    TESTCASE_AUTO(TestPatternCodePointComplement);
    TESTCASE_AUTO_END;
 }

@ -4095,3 +4096,65 @@ void UnicodeSetTest::TestSkipToStrings() {
    assertNext(iter, u"ch");
    assertFalse("no next", iter.next());
 }
+
+void UnicodeSetTest::TestPatternCodePointComplement() {  // Regression test for ICU-21524.
+    IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
+    // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
+    // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings. Each scope below checks one entry point.
+    {  // Pattern negation [^...]: exactly a, b, c are missing from the 0x110000 code points; {ch} is gone.
+        UnicodeSet simple(u"[^abc{ch}]", errorCode);
+        assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
+        assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
+        assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
+    }
+
+    {  // POSIX-style negated property [:^...:].
+        UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
+        if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
+            return;  // Skip the remaining checks; presumably the property data is unavailable (confirm).
+        }
+        assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
+        assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
+        assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U'🚲'));
+    }
+
+    {  // Binary property explicitly set to No inside a pattern.
+        UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
+        assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
+        assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
+        assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U'🚲'));
+    }
+
+    {  // Same expectations through the applyIntPropertyValue() API with value 0.
+        UnicodeSet notBasic;
+        notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
+        assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
+                notBasic.size() > 1000);
+        assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
+                notBasic.hasStrings());
+        assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
+                notBasic.contains(U'🚲'));
+    }
+
+    {  // Same expectations through the applyPropertyAlias() API with value name "No".
+        UnicodeSet notBasic;
+        notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
+        assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
+                notBasic.size() > 1000);
+        assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
+                notBasic.hasStrings());
+        assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
+                notBasic.contains(U'🚲'));
+    }
+
+    // The complement() API behavior does not change under this ticket.
+    {  // Contrast case: after an explicit complement() call the set is expected to still hold strings.
+        UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
+        notBasic.complement();
+        assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
+        assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
+        assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
+                notBasic.contains(u"🐿\uFE0F"));
+        assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'🚲'));
+    }
+}
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@ -99,6 +99,7 @@ private:

    void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
    void TestSkipToStrings();
+    void TestPatternCodePointComplement();

 private:

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@ -144,6 +144,13 @@ import com.ibm.icu.util.VersionInfo;
 * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
 * '^' has no special meaning.
 *
+ * <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
+ * perform a “code point complement” (all code points minus the original set),
+ * removing all multicharacter strings,
+ * equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
+ * The {@link #complement()} API function continues to perform a
+ * symmetric difference with all code points and thus retains all multicharacter strings.
+ *
 * <p>Ranges are indicated by placing two a '-' between two
 * characters, as in "a-z".  This specifies the range of all
 * characters from the left to the right, in Unicode order.  If the
@ -1689,6 +1696,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
    /**
     * This is equivalent to
     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
+     *
+     * <p><strong>Note:</strong> This performs a symmetric difference with all code points
+     * <em>and thus retains all multicharacter strings</em>.
+     * In order to achieve a “code point complement” (all code points minus this set),
+     * the easiest way is to call .{@link #complement()}.{@link #removeAllStrings()} .
+     *
     * @stable ICU 2.0
     */
    public UnicodeSet complement() {
@ -2953,7 +2966,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
            closeOver(CASE);
        }
        if (invert) {
-            complement();
+            complement().removeAllStrings();  // code point complement
        }

        // Use the rebuilt pattern (pat) only if necessary.  Prefer the
@ -3474,7 +3487,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
            if (value == 0 || value == 1) {
                set(CharacterProperties.getBinaryPropertySet(prop));
                if (value == 0) {
-                    complement();
+                    complement().removeAllStrings();  // code point complement
                }
            } else {
                clear();
@ -3670,7 +3683,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa

        applyIntPropertyValue(p, v);
        if(invert) {
-            complement();
+            complement().removeAllStrings();  // code point complement
        }

        return this;
@ -3798,7 +3811,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
        applyPropertyAlias(propName, valueName, symbols);

        if (invert) {
-            complement();
+            complement().removeAllStrings();  // code point complement
        }

        // Move to the limit position after the close delimiter
@ -4768,9 +4781,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
     */
    @Deprecated
    public UnicodeSet addBridges(UnicodeSet dontCare) {
-        UnicodeSet notInInput = new UnicodeSet(this).complement();
+        UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings();
        for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
-            if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
+            if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF &&
+                    dontCare.contains(it.codepoint, it.codepointEnd)) {
                add(it.codepoint,it.codepointEnd);
            }
        }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@ -153,12 +153,11 @@ public class UnicodeSetTest extends TestFmwk {
                    UnicodeSet collectedErrors = new UnicodeSet();
                    for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
                        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
-                            // For binary properties of strings, only [:propName=true:] _should_ yield strings.
+                            // For binary properties of strings, only [:propName=true:] yields strings.
                            // Therefore, we should always have valueNum=1 and b=true.
-                            // TODO: ICU-21524 ^ and propName=N use complement() which leaves strings alone.
                            boolean b = UCharacter.hasBinaryProperty(it.string, propNum);
                            int value = b ? 1 : 0;
-                            if (value != valueNum && /* TODO: ICU-21524 */ valueNum != 0) {
+                            if (value != valueNum) {
                                collectedErrors.add(it.string);
                            }
                        } else {
@ -2924,4 +2923,62 @@ public class UnicodeSetTest extends TestFmwk {
        assertNext(iter, "ch");
        assertFalse("no next", iter.next());
    }
+
+    @Test  // Regression test for ICU-21524.
+    public void TestPatternCodePointComplement() {
+        // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
+        // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings. Each scope below checks one entry point.
+        {  // Pattern negation [^...]: exactly a, b, c are missing from the 0x110000 code points; {ch} is gone.
+            UnicodeSet simple = new UnicodeSet("[^abc{ch}]");
+            assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
+            assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
+            assertFalse("[^abc{ch}] --> no 'a'", simple.contains('a'));
+        }
+
+        {  // POSIX-style negated property [:^...:].
+            UnicodeSet notBasic = new UnicodeSet("[:^Basic_Emoji:]");
+            assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
+            assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
+            assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains("🚲"));
+        }
+
+        {  // Binary property explicitly set to No inside a pattern.
+            UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji=No:]");
+            assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
+            assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
+            assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains("🚲"));
+        }
+
+        {  // Same expectations through the applyIntPropertyValue() API with value 0.
+            UnicodeSet notBasic = new UnicodeSet();
+            notBasic.applyIntPropertyValue(UProperty.BASIC_EMOJI, 0);
+            assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
+                    notBasic.size() > 1000);
+            assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
+                    notBasic.hasStrings());
+            assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
+                    notBasic.contains("🚲"));
+        }
+
+        {  // Same expectations through the applyPropertyAlias() API with value name "No".
+            UnicodeSet notBasic = new UnicodeSet();
+            notBasic.applyPropertyAlias("Basic_Emoji", "No");
+            assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
+                    notBasic.size() > 1000);
+            assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
+                    notBasic.hasStrings());
+            assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
+                    notBasic.contains("🚲"));
+        }
+
+        // The complement() API behavior does not change under this ticket.
+        {  // Contrast case: after an explicit complement() call the set is expected to still hold strings.
+            UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();
+            assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
+            assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
+            assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
+                    notBasic.contains("🐿\uFE0F"));
+            assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains("🚲"));
+        }
+    }
 }