mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-21524 UnicodeSet.hasStrings(): no complement in toPattern()
This commit is contained in:
parent
3a601a80a3
commit
b303de1ff1
5 changed files with 46 additions and 5 deletions
|
@@ -2095,7 +2095,10 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
|
|||
// getRangeEnd(last) == MAX_VALUE)
|
||||
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
|
||||
// If limit == len then len is even and the last range ends with MAX_VALUE.
|
||||
if (len >= 4 && list[0] == 0 && limit == len) {
|
||||
//
|
||||
// *But* do not write the inverse (complement) if there are strings.
|
||||
// Since ICU 70, the '^' performs a code point complement which removes all strings.
|
||||
if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
|
||||
// Emit the inverse
|
||||
result.append(u'^');
|
||||
// Offsetting the inversion list index by one lets us
|
||||
|
|
|
@@ -137,7 +137,7 @@ static void TestAPI() {
|
|||
uset_removeString(set, STR_ab, STR_ab_LEN);
|
||||
expect(set, "acd{bc}", "bfg{ab}", NULL);
|
||||
|
||||
/* [^acd{bc}] */
|
||||
/* [[^acd]{bc}] */
|
||||
uset_complement(set);
|
||||
expect(set, "bef{bc}", "acd{ac}", NULL);
|
||||
|
||||
|
@@ -436,8 +436,8 @@ static void expectItems(const USet* set,
|
|||
strlen(items)==0 ? "TRUE" : "FALSE");
|
||||
}
|
||||
|
||||
/* Don't test patterns starting with "[^" */
|
||||
if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
|
||||
/* Don't test patterns starting with "[^" or "[\\u0000". */
|
||||
if ((u_strlen(ustr) > 2 && ustr[1] == u'^') || uset_contains(set, 0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@@ -4147,6 +4147,24 @@ void UnicodeSetTest::TestPatternCodePointComplement() {
|
|||
notBasic.contains(U'🚲'));
|
||||
}
|
||||
|
||||
// When there are strings, we must not use the complement for a more compact toPattern().
|
||||
{
|
||||
UnicodeSet set;
|
||||
set.add(0, u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
|
||||
UnicodeString pattern;
|
||||
set.toPattern(pattern, true);
|
||||
UnicodeSet set2(pattern, errorCode);
|
||||
checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
|
||||
assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
|
||||
|
||||
set.add("ch").add("ss");
|
||||
set.toPattern(pattern, true);
|
||||
set2 = UnicodeSet(pattern, errorCode);
|
||||
checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
|
||||
assertEquals("set(with 0 & max, with strings).toPattern()",
|
||||
u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
|
||||
}
|
||||
|
||||
// The complement() API behavior does not change under this ticket.
|
||||
{
|
||||
UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
|
||||
|
|
|
@@ -818,7 +818,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// getRangeEnd(last) == MAX_VALUE)
|
||||
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
|
||||
// If limit == len then len is even and the last range ends with MAX_VALUE.
|
||||
if (len >= 4 && list[0] == 0 && limit == len) {
|
||||
//
|
||||
// *But* do not write the inverse (complement) if there are strings.
|
||||
// Since ICU 70, the '^' performs a code point complement which removes all strings.
|
||||
if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
|
||||
// Emit the inverse
|
||||
result.append('^');
|
||||
// Offsetting the inversion list index by one lets us
|
||||
|
|
|
@@ -2971,6 +2971,23 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
notBasic.contains("🚲"));
|
||||
}
|
||||
|
||||
// When there are strings, we must not use the complement for a more compact toPattern().
|
||||
{
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.add(0, 'Y').add('b', 'q').add('x', 0x10ffff);
|
||||
String pattern = set.toPattern(true);
|
||||
UnicodeSet set2 = new UnicodeSet(pattern);
|
||||
checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
|
||||
assertEquals("set(with 0 & max, only code points).toPattern()", "[^Z-ar-w]", pattern);
|
||||
|
||||
set.add("ch").add("ss");
|
||||
pattern = set.toPattern(true);
|
||||
set2 = new UnicodeSet(pattern);
|
||||
checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
|
||||
assertEquals("set(with 0 & max, with strings).toPattern()",
|
||||
"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
|
||||
}
|
||||
|
||||
// The complement() API behavior does not change under this ticket.
|
||||
{
|
||||
UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();
|
||||
|
|
Loading…
Add table
Reference in a new issue