ICU-11891 make UnicodeSet.toPattern() well-formed & round-trip

This commit is contained in:
Markus Scherer 2021-09-16 19:51:56 -07:00
parent 43276e8c34
commit 3f0e003901
10 changed files with 309 additions and 184 deletions

View file

@ -1597,6 +1597,9 @@ private:
static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
UBool escapeUnprintable);
//----------------------------------------------------------------
// Implementation: Fundamental operators
//----------------------------------------------------------------

View file

@ -1967,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
* Append the <code>toPattern()</code> representation of a
* string to the given <code>StringBuffer</code>.
*/
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
escapeUnprintable) {
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
@ -1979,14 +1978,12 @@ escapeUnprintable) {
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
*/
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
escapeUnprintable) {
if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
// unprintable
if (ICU_Utility::escapeUnprintable(buf, c)) {
return;
}
ICU_Utility::escape(buf, c);
return;
}
// Okay to let ':' pass through
switch (c) {
@ -2012,6 +2009,19 @@ escapeUnprintable) {
buf.append(c);
}
void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
UBool escapeUnprintable) {
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end ||
// Avoid writing what looks like a lead+trail surrogate pair.
start == 0xdbff) {
result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
}
}
/**
* Append a string representation of this set to result. This will be
* a cleaned version of the string passed to applyPattern(), if there
@ -2026,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
for (i=0; i<patLen; ) {
UChar32 c;
U16_NEXT(pat, i, patLen, c);
if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
if (escapeUnprintable ?
ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
// Before unescaping it, we delete the final
@ -2034,7 +2045,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
if ((backslashCount % 2) == 1) {
result.truncate(result.length() - 1);
}
ICU_Utility::escapeUnprintable(result, c);
ICU_Utility::escape(result, c);
backslashCount = 0;
} else {
result.append(c);
@ -2073,52 +2084,48 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
{
result.append(u'[');
// // Check against the predefined categories. We implicitly build
// // up ALL category sets the first time toPattern() is called.
// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
// if (*this == getCategorySet(cat)) {
// result.append(u':');
// result.append(CATEGORY_NAMES, cat*2, 2);
// return result.append(CATEGORY_CLOSE);
// }
// }
int32_t count = getRangeCount();
int32_t i = 0;
int32_t limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
if (count > 1 &&
getRangeStart(0) == MIN_VALUE &&
getRangeEnd(count-1) == MAX_VALUE) {
// if (getRangeCount() >= 2 &&
// getRangeStart(0) == MIN_VALUE &&
// getRangeEnd(last) == MAX_VALUE)
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
// If limit == len then len is even and the last range ends with MAX_VALUE.
if (len >= 4 && list[0] == 0 && limit == len) {
// Emit the inverse
result.append(u'^');
for (int32_t i = 1; i < count; ++i) {
UChar32 start = getRangeEnd(i-1)+1;
UChar32 end = getRangeStart(i)-1;
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
}
}
// Offsetting the inversion list index by one lets us
// iterate over the ranges of the set complement.
i = 1;
--limit;
}
// Default; emit the ranges as pairs
else {
for (int32_t i = 0; i < count; ++i) {
UChar32 start = getRangeStart(i);
UChar32 end = getRangeEnd(i);
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
// Emit the ranges as pairs.
while (i < limit) {
UChar32 start = list[i]; // getRangeStart()
UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
if (!(0xd800 <= end && end <= 0xdbff)) {
_appendToPat(result, start, end, escapeUnprintable);
i += 2;
} else {
// The range ends with a lead surrogate.
// Avoid writing what looks like a lead+trail surrogate pair.
// 1. Postpone ranges that start with a lead surrogate code point.
int32_t firstLead = i;
while ((i += 2) < limit && list[i] <= 0xdbff) {}
int32_t firstAfterLead = i;
// 2. Write following ranges that start with a trail surrogate code point.
while (i < limit && (start = list[i]) <= 0xdfff) {
_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
i += 2;
}
// 3. Now write the postponed ranges.
for (int j = firstLead; j < firstAfterLead; j += 2) {
_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}

View file

@ -65,38 +65,52 @@ UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
return result;
}
/**
* Return true if the character is NOT printable ASCII.
*/
UBool ICU_Utility::isUnprintable(UChar32 c) {
return !(c >= 0x20 && c <= 0x7E);
}
/**
* Escape unprintable characters using \uxxxx notation for U+0000 to
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
* printable ASCII, then do nothing and return FALSE. Otherwise,
* append the escaped notation and return TRUE.
*/
UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) {
if (c < 0x20) {
return true; // C0 control codes
} else if (c <= 0x7e) {
return false; // printable ASCII
} else if (c <= 0x9f) {
return true; // C1 control codes
} else if (c < 0xd800) {
return false; // most of the BMP
} else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
return true; // surrogate or noncharacter code points
} else if (c <= 0x10ffff) {
return false; // all else
} else {
return true; // not a code point
}
}
UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
if (isUnprintable(c)) {
result.append(BACKSLASH);
if (c & ~0xFFFF) {
result.append(UPPER_U);
result.append(DIGITS[0xF&(c>>28)]);
result.append(DIGITS[0xF&(c>>24)]);
result.append(DIGITS[0xF&(c>>20)]);
result.append(DIGITS[0xF&(c>>16)]);
} else {
result.append(LOWER_U);
}
result.append(DIGITS[0xF&(c>>12)]);
result.append(DIGITS[0xF&(c>>8)]);
result.append(DIGITS[0xF&(c>>4)]);
result.append(DIGITS[0xF&c]);
return TRUE;
escape(result, c);
return true;
}
return FALSE;
return false;
}
UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) {
result.append(BACKSLASH);
if (c & ~0xFFFF) {
result.append(UPPER_U);
result.append(DIGITS[0xF&(c>>28)]);
result.append(DIGITS[0xF&(c>>24)]);
result.append(DIGITS[0xF&(c>>20)]);
result.append(DIGITS[0xF&(c>>16)]);
} else {
result.append(LOWER_U);
}
result.append(DIGITS[0xF&(c>>12)]);
result.append(DIGITS[0xF&(c>>8)]);
result.append(DIGITS[0xF&(c>>4)]);
result.append(DIGITS[0xF&c]);
return result;
}
/**

View file

@ -55,20 +55,30 @@ class U_COMMON_API ICU_Utility /* not : public UObject because all methods are s
/**
* Return true if the character is NOT printable ASCII.
*
* This method should really be in UnicodeString (or similar). For
* now, we implement it here and share it with friend classes.
* The tab, newline and linefeed characters are considered unprintable.
*/
static UBool isUnprintable(UChar32 c);
/**
* Escape unprintable characters using \uxxxx notation for U+0000 to
* @return true for control codes and for surrogate and noncharacter code points
*/
static UBool shouldAlwaysBeEscaped(UChar32 c);
/**
* Escapes one unprintable code point using \uxxxx notation for U+0000 to
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
* printable ASCII, then do nothing and return false. Otherwise,
* append the escaped notation and return true.
*/
static UBool escapeUnprintable(UnicodeString& result, UChar32 c);
/**
* Escapes one code point using \uxxxx notation
* for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above.
* @return result
*/
static UnicodeString &escape(UnicodeString& result, UChar32 c);
/**
* Returns the index of a character, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be

View file

@ -94,7 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestFreezable);
TESTCASE_AUTO(TestSpan);
TESTCASE_AUTO(TestStringSpan);
TESTCASE_AUTO(TestUCAUnsafeBackwards);
TESTCASE_AUTO(TestPatternWithSurrogates);
TESTCASE_AUTO(TestIntOverflow);
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO(TestDeepPattern);
@ -3908,60 +3908,47 @@ void UnicodeSetTest::TestStringSpan() {
}
}
/**
* Including collationroot.h fails here with
1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
* .. so, we skip this test on Windows.
*
* the cause is that intltest builds with /Za which disables language extensions - which means
* windows header files can't be used.
*/
#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
#include "collationroot.h"
#include "collationtailoring.h"
#endif
void UnicodeSetTest::TestPatternWithSurrogates() {
IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
// Regression test for ICU-11891
UnicodeSet surrogates;
surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
surrogates.add(0xd83a); // a lead surrogate
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
UnicodeString pat;
surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00
UnicodeSet s2;
// was: U_MALFORMED_SET
// Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
s2.applyPattern(pat, errorCode);
if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
void UnicodeSetTest::TestUCAUnsafeBackwards() {
#if U_PLATFORM_HAS_WIN32_API
infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
#elif !UCONFIG_NO_COLLATION
UErrorCode errorCode = U_ZERO_ERROR;
// create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
// add a beyond-surrogates range, up to the last code point
surrogates.add(0x10affe, 0x10ffff);
surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00
s2.applyPattern(pat, errorCode);
if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
// Get the unsafeBackwardsSet
const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
if(U_FAILURE(errorCode)) {
dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
return;
}
//const UVersionInfo &version = rootEntry->tailoring->version;
const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
// Test the toPattern() code path when the pattern is shorter in complement form:
// [^opposite-ranges]
surrogates.add(0, 0x6789);
surrogates.toPattern(pat, false);
s2.applyPattern(pat, errorCode);
if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
// simple test case
// TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
// TODO(ticket #11891): Port test to Java. Is this a bug there, too?
UnicodeSet surrogates;
surrogates.add(0xd83a); // a lead surrogate
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
UnicodeString pat;
surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
// TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
// so that at least one type of surrogate code points are escaped,
// or (minimally) so that adjacent lead+trail surrogate code points are escaped.
errorCode = U_ZERO_ERROR;
UnicodeSet s2;
s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
if(U_FAILURE(errorCode)) {
errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
} else {
checkEqual(surrogates, s2, "surrogates to/from pattern");
}
// This occurs in the UCA unsafe-backwards set.
checkRoundTrip(*unsafeBackwardSet);
}
#endif
// Start with a pattern, in case the original pattern is kept but
// without the extra white space.
surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
surrogates.toPattern(pat, false);
s2.applyPattern(pat, errorCode);
if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
checkEqual(surrogates, s2, "surrogates from/to/from pattern");
}
void UnicodeSetTest::TestIntOverflow() {

View file

@ -91,7 +91,7 @@ private:
void TestStringSpan();
void TestUCAUnsafeBackwards();
void TestPatternWithSurrogates();
void TestIntOverflow();
void TestUnusedCcc();
void TestDeepPattern();

View file

@ -1536,34 +1536,65 @@ public final class Utility {
}
/**
* Escape unprintable characters using <backslash>uxxxx notation
* @return true for control codes and for surrogate and noncharacter code points
*/
public static boolean shouldAlwaysBeEscaped(int c) {
if (c < 0x20) {
return true; // C0 control codes
} else if (c <= 0x7e) {
return false; // printable ASCII
} else if (c <= 0x9f) {
return true; // C1 control codes
} else if (c < 0xd800) {
return false; // most of the BMP
} else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
return true; // surrogate or noncharacter code points
} else if (c <= 0x10ffff) {
return false; // all else
} else {
return true; // not a code point
}
}
/**
* Escapes one unprintable code point using <backslash>uxxxx notation
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
* above. If the character is printable ASCII, then do nothing
* and return FALSE. Otherwise, append the escaped notation and
* return TRUE.
*/
public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
if (isUnprintable(c)) {
escape(result, c);
return true;
}
return false;
}
/**
* Escapes one code point using <backslash>uxxxx notation
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above.
* @return result
*/
public static <T extends Appendable> T escape(T result, int c) {
try {
if (isUnprintable(c)) {
result.append('\\');
if ((c & ~0xFFFF) != 0) {
result.append('U');
result.append(DIGITS[0xF&(c>>28)]);
result.append(DIGITS[0xF&(c>>24)]);
result.append(DIGITS[0xF&(c>>20)]);
result.append(DIGITS[0xF&(c>>16)]);
} else {
result.append('u');
}
result.append(DIGITS[0xF&(c>>12)]);
result.append(DIGITS[0xF&(c>>8)]);
result.append(DIGITS[0xF&(c>>4)]);
result.append(DIGITS[0xF&c]);
return true;
result.append('\\');
if ((c & ~0xFFFF) != 0) {
result.append('U');
result.append(DIGITS[0xF&(c>>28)]);
result.append(DIGITS[0xF&(c>>24)]);
result.append(DIGITS[0xF&(c>>20)]);
result.append(DIGITS[0xF&(c>>16)]);
} else {
result.append('u');
}
return false;
result.append(DIGITS[0xF&(c>>12)]);
result.append(DIGITS[0xF&(c>>8)]);
result.append(DIGITS[0xF&(c>>4)]);
result.append(DIGITS[0xF&c]);
return result;
} catch (IOException e) {
throw new IllegalIcuArgumentException(e);
throw new ICUUncheckedIOException(e);
}
}

View file

@ -650,12 +650,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*/
private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
try {
if (escapeUnprintable && Utility.isUnprintable(c)) {
if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
// unprintable
if (Utility.escapeUnprintable(buf, c)) {
return buf;
}
return Utility.escape(buf, c);
}
// Okay to let ':' pass through
switch (c) {
@ -685,6 +683,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
}
private static <T extends Appendable> T _appendToPat(
T result, int start, int end, boolean escapeUnprintable) {
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end ||
// Avoid writing what looks like a lead+trail surrogate pair.
start == 0xdbff) {
try {
result.append('-');
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
}
}
_appendToPat(result, end, escapeUnprintable);
}
return result;
}
/**
* Returns a string representation of this set. If the result of
* calling this function is passed to a UnicodeSet constructor, it
@ -712,6 +728,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
try {
if (!escapeUnprintable) {
// TODO: The C++ version does not have this shortcut, and instead
// always cleans up the pattern string,
// which also escapes Utility.shouldAlwaysBeEscaped(c).
// We should sync these implementations.
result.append(pat);
return result;
}
@ -723,7 +743,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped
// and we omit the last backslash.
Utility.escapeUnprintable(result, c);
Utility.escape(result, c);
oddNumberOfBackslashes = false;
} else if (!oddNumberOfBackslashes && c == '\\') {
// Temporarily withhold an odd-numbered backslash.
@ -749,6 +769,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
*
* @param result the buffer into which to generate the pattern
* @param escapeUnprintable escape unprintable characters if true
* @stable ICU 2.0
@ -761,6 +782,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
*
* @param result the buffer into which to generate the pattern
* @param escapeUnprintable escape unprintable characters if true
* @param includeStrings if false, doesn't include the strings.
* @stable ICU 3.8
*/
@ -769,47 +793,55 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
return appendNewPattern(result, escapeUnprintable, includeStrings);
}
// Implementation of public _generatePattern().
// Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
private <T extends Appendable> T appendNewPattern(
T result, boolean escapeUnprintable, boolean includeStrings) {
try {
result.append('[');
int count = getRangeCount();
int i = 0;
int limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
if (count > 1 &&
getRangeStart(0) == MIN_VALUE &&
getRangeEnd(count-1) == MAX_VALUE) {
// if (getRangeCount() >= 2 &&
// getRangeStart(0) == MIN_VALUE &&
// getRangeEnd(last) == MAX_VALUE)
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
// If limit == len then len is even and the last range ends with MAX_VALUE.
if (len >= 4 && list[0] == 0 && limit == len) {
// Emit the inverse
result.append('^');
for (int i = 1; i < count; ++i) {
int start = getRangeEnd(i-1)+1;
int end = getRangeStart(i)-1;
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
result.append('-');
}
_appendToPat(result, end, escapeUnprintable);
}
}
// Offsetting the inversion list index by one lets us
// iterate over the ranges of the set complement.
i = 1;
--limit;
}
// Default; emit the ranges as pairs
else {
for (int i = 0; i < count; ++i) {
int start = getRangeStart(i);
int end = getRangeEnd(i);
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
result.append('-');
}
_appendToPat(result, end, escapeUnprintable);
// Emit the ranges as pairs.
while (i < limit) {
int start = list[i]; // getRangeStart()
int end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
if (!(0xd800 <= end && end <= 0xdbff)) {
_appendToPat(result, start, end, escapeUnprintable);
i += 2;
} else {
// The range ends with a lead surrogate.
// Avoid writing what looks like a lead+trail surrogate pair.
// 1. Postpone ranges that start with a lead surrogate code point.
int firstLead = i;
while ((i += 2) < limit && list[i] <= 0xdbff) {}
int firstAfterLead = i;
// 2. Write following ranges that start with a trail surrogate code point.
while (i < limit && (start = list[i]) <= 0xdfff) {
_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
i += 2;
}
// 3. Now write the postponed ranges.
for (int j = firstLead; j < firstAfterLead; j += 2) {
_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}

View file

@ -2752,6 +2752,42 @@ public class UnicodeSetTest extends TestFmwk {
UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
}
@Test
public void TestPatternWithSurrogates() {
// Regression test for ICU-11891
UnicodeSet surrogates = new UnicodeSet();
surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
surrogates.add(0xd83a); // a lead surrogate
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
String pat = surrogates.toPattern(false); // bad if U+D83A is immediately followed by U+DC00
UnicodeSet s2 = new UnicodeSet();
// was: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
s2.applyPattern(pat);
checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
// create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
// add a beyond-surrogates range, up to the last code point
surrogates.add(0x10affe, 0x10ffff);
pat = surrogates.toPattern(false); // bad if U+DBFF is immediately followed by U+DC00
s2.applyPattern(pat);
checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
// Test the toPattern() code path when the pattern is shorter in complement form:
// [^opposite-ranges]
surrogates.add(0, 0x6789);
pat = surrogates.toPattern(false);
s2.applyPattern(pat);
checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
// Start with a pattern, in case the original pattern is kept but
// without the extra white space.
surrogates.applyPattern("[\\uD83A \\uDC00-\\uDFFF]");
pat = surrogates.toPattern(false);
s2.applyPattern(pat);
checkEqual(surrogates, s2, "surrogates from/to/from pattern");
}
@Test
public void TestUnusedCcc() {
// All numeric ccc values 0..255 are valid, but many are unused.

View file

@ -79,8 +79,13 @@ public class RegexUtilitiesTest extends TestFmwk {
errln(e.getMessage());
continue;
}
final String expected = "[" + s + "]";
assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
String expected = "[" + s + "]"; // Try this first for faster testing.
boolean ok = pattern.equals(expected);
if (!ok) {
expected = new UnicodeSet(expected).toPattern(false);
ok = pattern.equals(expected);
}
assertTrue("Doubled character works " + hex.transform(s), ok);
// verify that we can create a regex pattern and use as expected
String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);