mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-21524 UnicodeSet pattern parser: code point complement
This commit is contained in:
parent
7bc2009f7f
commit
f026e967f6
8 changed files with 186 additions and 20 deletions
|
@ -136,6 +136,13 @@ class RuleCharacterIterator;
|
|||
* their delimiters; "[:^foo]" and "\\P{foo}". In any other location,
|
||||
* '^' has no special meaning.
|
||||
*
|
||||
* <p>Since ICU 70, "[^...]", "[:^foo]", "\\P{foo}", and "[:binaryProperty=No:]"
|
||||
* perform a “code point complement” (all code points minus the original set),
|
||||
* removing all multicharacter strings,
|
||||
* equivalent to <code>.complement().removeAllStrings()</code>.
|
||||
* The complement() API function continues to perform a
|
||||
* symmetric difference with all code points and thus retains all multicharacter strings.
|
||||
*
|
||||
* <p>Ranges are indicated by placing a '-' between two
|
||||
* characters, as in "a-z". This specifies the range of all
|
||||
* characters from the left to the right, in Unicode order. If the
|
||||
|
@ -1275,13 +1282,18 @@ public:
|
|||
UnicodeSet& remove(const UnicodeString& s);
|
||||
|
||||
/**
|
||||
* Inverts this set. This operation modifies this set so that
|
||||
* its value is its complement. This is equivalent to
|
||||
* This is equivalent to
|
||||
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
|
||||
*
|
||||
* <strong>Note:</strong> This performs a symmetric difference with all code points
|
||||
* <em>and thus retains all multicharacter strings</em>.
|
||||
* In order to achieve a “code point complement” (all code points minus this set),
|
||||
* the easiest way is to call <code>.complement().removeAllStrings()</code>.
|
||||
*
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UnicodeSet& complement(void);
|
||||
virtual UnicodeSet& complement();
|
||||
|
||||
/**
|
||||
* Complements the specified range in this set. Any character in
|
||||
|
|
|
@ -726,9 +726,14 @@ U_CAPI void U_EXPORT2
|
|||
uset_compact(USet* set);
|
||||
|
||||
/**
|
||||
* Inverts this set. This operation modifies this set so that
|
||||
* its value is its complement. This operation does not affect
|
||||
* the multicharacter strings, if any.
|
||||
* This is equivalent to
|
||||
* <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
|
||||
*
|
||||
* <strong>Note:</strong> This performs a symmetric difference with all code points
|
||||
* <em>and thus retains all multicharacter strings</em>.
|
||||
* In order to achieve a “code point complement” (all code points minus this set),
|
||||
* the easiest way is to call <code>uset_complement(set); uset_removeAllStrings(set);</code>.
|
||||
*
|
||||
* A frozen set will not be modified.
|
||||
* @param set the set
|
||||
* @stable ICU 2.4
|
||||
|
|
|
@ -638,7 +638,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
|
|||
(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
|
||||
}
|
||||
if (invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
// Use the rebuilt pattern (patLocal) only if necessary. Prefer the
|
||||
|
@ -791,7 +791,7 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
|
|||
if (U_FAILURE(ec)) { return *this; }
|
||||
copyFrom(*UnicodeSet::fromUSet(set), TRUE);
|
||||
if (value == 0) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
} else {
|
||||
clear();
|
||||
|
@ -958,7 +958,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
|||
|
||||
applyIntPropertyValue(p, v, ec);
|
||||
if(invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
if (isBogus() && U_SUCCESS(ec)) {
|
||||
|
@ -1101,9 +1101,9 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
|
|||
|
||||
if (U_SUCCESS(ec)) {
|
||||
if (invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
|
||||
// Move to the limit position after the close delimiter if the
|
||||
// parse succeeded.
|
||||
ppos.setIndex(close + (posix ? 2 : 1));
|
||||
|
|
|
@ -4580,6 +4580,13 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
|||
} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
|
||||
|
||||
if (U_SUCCESS(status)) {
|
||||
// ICU 70 adds emoji properties of strings, but as long as Java does not say how to
|
||||
// deal with properties of strings and character classes with strings, we ignore them.
|
||||
// Just in case something downstream might stumble over the strings,
|
||||
// we remove them from the set.
|
||||
// Note that when we support strings, the complement of a property (as with \P)
|
||||
// should be implemented as .complement().removeAllStrings() (code point complement).
|
||||
set->removeAllStrings();
|
||||
U_ASSERT(set.isValid());
|
||||
if (negated) {
|
||||
set->complement();
|
||||
|
@ -4613,6 +4620,13 @@ void RegexCompile::setEval(int32_t nextOp) {
|
|||
fSetOpStack.popi();
|
||||
U_ASSERT(fSetStack.empty() == FALSE);
|
||||
rightOperand = (UnicodeSet *)fSetStack.peek();
|
||||
// ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
|
||||
// (see comments there).
|
||||
// We also do not yet support string literals in character classes,
|
||||
// so there should not be any strings.
|
||||
// Note that when we support strings, the complement of a set (as with ^ or \P)
|
||||
// should be implemented as .complement().removeAllStrings() (code point complement).
|
||||
U_ASSERT(!rightOperand->hasStrings());
|
||||
switch (pendingSetOperation) {
|
||||
case setNegation:
|
||||
rightOperand->complement();
|
||||
|
|
|
@ -100,6 +100,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestDeepPattern);
|
||||
TESTCASE_AUTO(TestEmptyString);
|
||||
TESTCASE_AUTO(TestSkipToStrings);
|
||||
TESTCASE_AUTO(TestPatternCodePointComplement);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4095,3 +4096,65 @@ void UnicodeSetTest::TestSkipToStrings() {
|
|||
assertNext(iter, u"ch");
|
||||
assertFalse("no next", iter.next());
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestPatternCodePointComplement() {
|
||||
IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
|
||||
// ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
|
||||
// [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
|
||||
{
|
||||
UnicodeSet simple(u"[^abc{ch}]", errorCode);
|
||||
assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
|
||||
assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
|
||||
assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
|
||||
if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
|
||||
return;
|
||||
}
|
||||
assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
|
||||
assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
|
||||
assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U'🚲'));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
|
||||
assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
|
||||
assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
|
||||
assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U'🚲'));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic;
|
||||
notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
|
||||
assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
|
||||
notBasic.size() > 1000);
|
||||
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
|
||||
notBasic.hasStrings());
|
||||
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
|
||||
notBasic.contains(U'🚲'));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic;
|
||||
notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
|
||||
assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
|
||||
notBasic.size() > 1000);
|
||||
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
|
||||
notBasic.hasStrings());
|
||||
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
|
||||
notBasic.contains(U'🚲'));
|
||||
}
|
||||
|
||||
// The complement() API behavior does not change under this ticket.
|
||||
{
|
||||
UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
|
||||
notBasic.complement();
|
||||
assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
|
||||
assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
|
||||
assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
|
||||
notBasic.contains(u"🐿\uFE0F"));
|
||||
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'🚲'));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -99,6 +99,7 @@ private:
|
|||
|
||||
void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
|
||||
void TestSkipToStrings();
|
||||
void TestPatternCodePointComplement();
|
||||
|
||||
private:
|
||||
|
||||
|
|
|
@ -144,6 +144,13 @@ import com.ibm.icu.util.VersionInfo;
|
|||
* their delimiters; "[:^foo]" and "\P{foo}". In any other location,
|
||||
* '^' has no special meaning.
|
||||
*
|
||||
* <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
|
||||
* perform a “code point complement” (all code points minus the original set),
|
||||
* removing all multicharacter strings,
|
||||
* equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
|
||||
* The {@link #complement()} API function continues to perform a
|
||||
* symmetric difference with all code points and thus retains all multicharacter strings.
|
||||
*
|
||||
* <p>Ranges are indicated by placing a '-' between two
|
||||
* characters, as in "a-z". This specifies the range of all
|
||||
* characters from the left to the right, in Unicode order. If the
|
||||
|
@ -1689,6 +1696,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
/**
|
||||
* This is equivalent to
|
||||
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
|
||||
*
|
||||
* <p><strong>Note:</strong> This performs a symmetric difference with all code points
|
||||
* <em>and thus retains all multicharacter strings</em>.
|
||||
* In order to achieve a “code point complement” (all code points minus this set),
|
||||
* the easiest way is to call {@link #complement()} and then {@link #removeAllStrings()}.
|
||||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public UnicodeSet complement() {
|
||||
|
@ -2953,7 +2966,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
closeOver(CASE);
|
||||
}
|
||||
if (invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
// Use the rebuilt pattern (pat) only if necessary. Prefer the
|
||||
|
@ -3474,7 +3487,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
if (value == 0 || value == 1) {
|
||||
set(CharacterProperties.getBinaryPropertySet(prop));
|
||||
if (value == 0) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
} else {
|
||||
clear();
|
||||
|
@ -3670,7 +3683,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
|
||||
applyIntPropertyValue(p, v);
|
||||
if(invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
return this;
|
||||
|
@ -3798,7 +3811,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
applyPropertyAlias(propName, valueName, symbols);
|
||||
|
||||
if (invert) {
|
||||
complement();
|
||||
complement().removeAllStrings(); // code point complement
|
||||
}
|
||||
|
||||
// Move to the limit position after the close delimiter
|
||||
|
@ -4768,9 +4781,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
*/
|
||||
@Deprecated
|
||||
public UnicodeSet addBridges(UnicodeSet dontCare) {
|
||||
UnicodeSet notInInput = new UnicodeSet(this).complement();
|
||||
UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
|
||||
if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
|
||||
if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF &&
|
||||
dontCare.contains(it.codepoint, it.codepointEnd)) {
|
||||
add(it.codepoint,it.codepointEnd);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -153,12 +153,11 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
UnicodeSet collectedErrors = new UnicodeSet();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
|
||||
if (it.codepoint == UnicodeSetIterator.IS_STRING) {
|
||||
// For binary properties of strings, only [:propName=true:] _should_ yield strings.
|
||||
// For binary properties of strings, only [:propName=true:] yields strings.
|
||||
// Therefore, we should always have valueNum=1 and b=true.
|
||||
// TODO: ICU-21524 ^ and propName=N use complement() which leaves strings alone.
|
||||
boolean b = UCharacter.hasBinaryProperty(it.string, propNum);
|
||||
int value = b ? 1 : 0;
|
||||
if (value != valueNum && /* TODO: ICU-21524 */ valueNum != 0) {
|
||||
if (value != valueNum) {
|
||||
collectedErrors.add(it.string);
|
||||
}
|
||||
} else {
|
||||
|
@ -2924,4 +2923,62 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
assertNext(iter, "ch");
|
||||
assertFalse("no next", iter.next());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestPatternCodePointComplement() {
|
||||
// ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
|
||||
// [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
|
||||
{
|
||||
UnicodeSet simple = new UnicodeSet("[^abc{ch}]");
|
||||
assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
|
||||
assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
|
||||
assertFalse("[^abc{ch}] --> no 'a'", simple.contains('a'));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet("[:^Basic_Emoji:]");
|
||||
assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
|
||||
assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
|
||||
assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains("🚲"));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji=No:]");
|
||||
assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
|
||||
assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
|
||||
assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains("🚲"));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet();
|
||||
notBasic.applyIntPropertyValue(UProperty.BASIC_EMOJI, 0);
|
||||
assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
|
||||
notBasic.size() > 1000);
|
||||
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
|
||||
notBasic.hasStrings());
|
||||
assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
|
||||
notBasic.contains("🚲"));
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet();
|
||||
notBasic.applyPropertyAlias("Basic_Emoji", "No");
|
||||
assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
|
||||
notBasic.size() > 1000);
|
||||
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
|
||||
notBasic.hasStrings());
|
||||
assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
|
||||
notBasic.contains("🚲"));
|
||||
}
|
||||
|
||||
// The complement() API behavior does not change under this ticket.
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();
|
||||
assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
|
||||
assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
|
||||
assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
|
||||
notBasic.contains("🐿\uFE0F"));
|
||||
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains("🚲"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue