From b303de1ff19024ac6d4ca9ebc358644e0d73f69f Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Mon, 18 Oct 2021 17:19:15 -0700
Subject: [PATCH] ICU-21524 UnicodeSet.hasStrings(): no complement in toPattern()

---
 icu4c/source/common/uniset.cpp                 |  5 ++++-
 icu4c/source/test/cintltst/usettest.c          |  6 +++---
 icu4c/source/test/intltest/usettest.cpp        | 18 ++++++++++++++++++
 .../core/src/com/ibm/icu/text/UnicodeSet.java  |  5 ++++-
 .../ibm/icu/dev/test/lang/UnicodeSetTest.java  | 17 +++++++++++++++++
 5 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp
index 967ea2ecdb2..92a81a1a02d 100644
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -2095,7 +2095,10 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
     // getRangeEnd(last) == MAX_VALUE)
     // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
     // If limit == len then len is even and the last range ends with MAX_VALUE.
-    if (len >= 4 && list[0] == 0 && limit == len) {
+    //
+    // *But* do not write the inverse (complement) if there are strings.
+    // Since ICU 70, the '^' performs a code point complement which removes all strings.
+    if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
         // Emit the inverse
         result.append(u'^');
         // Offsetting the inversion list index by one lets us
diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c
index a444ce5c937..f5528d05801 100644
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c
@@ -137,7 +137,7 @@ static void TestAPI() {
     uset_removeString(set, STR_ab, STR_ab_LEN);
     expect(set, "acd{bc}", "bfg{ab}", NULL);
 
-    /* [^acd{bc}] */
+    /* [[^acd]{bc}] */
     uset_complement(set);
     expect(set, "bef{bc}", "acd{ac}", NULL);
 
@@ -436,8 +436,8 @@ static void expectItems(const USet* set,
                 strlen(items)==0 ? "TRUE" : "FALSE");
     }
 
-    /* Don't test patterns starting with "[^" */
-    if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
+    /* Don't test patterns starting with "[^" or "[\\u0000". */
+    if ((u_strlen(ustr) > 2 && ustr[1] == u'^') || uset_contains(set, 0)) {
         return;
     }
 
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 7a0641f3a53..b4bee760107 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4147,6 +4147,24 @@ void UnicodeSetTest::TestPatternCodePointComplement() {
                 notBasic.contains(U'🚲'));
     }
 
+    // When there are strings, we must not use the complement for a more compact toPattern().
+    {
+        UnicodeSet set;
+        set.add(0, u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
+        UnicodeString pattern;
+        set.toPattern(pattern, true);
+        UnicodeSet set2(pattern, errorCode);
+        checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+        assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
+
+        set.add("ch").add("ss");
+        set.toPattern(pattern, true);
+        set2 = UnicodeSet(pattern, errorCode);
+        checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+        assertEquals("set(with 0 & max, with strings).toPattern()",
+                u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+    }
+
     // The complement() API behavior does not change under this ticket.
     {
         UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index e677fcaad89..d799b03aaa8 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -818,7 +818,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
         // getRangeEnd(last) == MAX_VALUE)
         // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
         // If limit == len then len is even and the last range ends with MAX_VALUE.
-        if (len >= 4 && list[0] == 0 && limit == len) {
+        //
+        // *But* do not write the inverse (complement) if there are strings.
+        // Since ICU 70, the '^' performs a code point complement which removes all strings.
+        if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
             // Emit the inverse
             result.append('^');
             // Offsetting the inversion list index by one lets us
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
index a94b9fe7e6a..e31d92bcba4 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -2971,6 +2971,23 @@ public class UnicodeSetTest extends TestFmwk {
                     notBasic.contains("🚲"));
         }
 
+        // When there are strings, we must not use the complement for a more compact toPattern().
+        {
+            UnicodeSet set = new UnicodeSet();
+            set.add(0, 'Y').add('b', 'q').add('x', 0x10ffff);
+            String pattern = set.toPattern(true);
+            UnicodeSet set2 = new UnicodeSet(pattern);
+            checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+            assertEquals("set(with 0 & max, only code points).toPattern()", "[^Z-ar-w]", pattern);
+
+            set.add("ch").add("ss");
+            pattern = set.toPattern(true);
+            set2 = new UnicodeSet(pattern);
+            checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+            assertEquals("set(with 0 & max, with strings).toPattern()",
+                    "[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+        }
+
         // The complement() API behavior does not change under this ticket.
         {
             UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();