mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-11891 make UnicodeSet.toPattern() well-formed & round-trip
This commit is contained in:
parent
43276e8c34
commit
3f0e003901
10 changed files with 309 additions and 184 deletions
|
@ -1597,6 +1597,9 @@ private:
|
|||
|
||||
static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
|
||||
|
||||
static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
|
||||
UBool escapeUnprintable);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Fundamental operators
|
||||
//----------------------------------------------------------------
|
||||
|
|
|
@ -1967,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
|
|||
* Append the <code>toPattern()</code> representation of a
|
||||
* string to the given <code>StringBuffer</code>.
|
||||
*/
|
||||
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
|
||||
escapeUnprintable) {
|
||||
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
|
||||
UChar32 cp;
|
||||
for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
|
||||
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
|
||||
|
@ -1979,14 +1978,12 @@ escapeUnprintable) {
|
|||
* Append the <code>toPattern()</code> representation of a
|
||||
* character to the given <code>StringBuffer</code>.
|
||||
*/
|
||||
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
|
||||
escapeUnprintable) {
|
||||
if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
|
||||
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
|
||||
if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
|
||||
// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
|
||||
// unprintable
|
||||
if (ICU_Utility::escapeUnprintable(buf, c)) {
|
||||
return;
|
||||
}
|
||||
ICU_Utility::escape(buf, c);
|
||||
return;
|
||||
}
|
||||
// Okay to let ':' pass through
|
||||
switch (c) {
|
||||
|
@ -2012,6 +2009,19 @@ escapeUnprintable) {
|
|||
buf.append(c);
|
||||
}
|
||||
|
||||
void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
|
||||
UBool escapeUnprintable) {
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end ||
|
||||
// Avoid writing what looks like a lead+trail surrogate pair.
|
||||
start == 0xdbff) {
|
||||
result.append(u'-');
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a string representation of this set to result. This will be
|
||||
* a cleaned version of the string passed to applyPattern(), if there
|
||||
|
@ -2026,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
|
|||
for (i=0; i<patLen; ) {
|
||||
UChar32 c;
|
||||
U16_NEXT(pat, i, patLen, c);
|
||||
if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
|
||||
if (escapeUnprintable ?
|
||||
ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
|
||||
// If the unprintable character is preceded by an odd
|
||||
// number of backslashes, then it has been escaped.
|
||||
// Before unescaping it, we delete the final
|
||||
|
@ -2034,7 +2045,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
|
|||
if ((backslashCount % 2) == 1) {
|
||||
result.truncate(result.length() - 1);
|
||||
}
|
||||
ICU_Utility::escapeUnprintable(result, c);
|
||||
ICU_Utility::escape(result, c);
|
||||
backslashCount = 0;
|
||||
} else {
|
||||
result.append(c);
|
||||
|
@ -2073,52 +2084,48 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
|
|||
{
|
||||
result.append(u'[');
|
||||
|
||||
// // Check against the predefined categories. We implicitly build
|
||||
// // up ALL category sets the first time toPattern() is called.
|
||||
// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
|
||||
// if (*this == getCategorySet(cat)) {
|
||||
// result.append(u':');
|
||||
// result.append(CATEGORY_NAMES, cat*2, 2);
|
||||
// return result.append(CATEGORY_CLOSE);
|
||||
// }
|
||||
// }
|
||||
|
||||
int32_t count = getRangeCount();
|
||||
int32_t i = 0;
|
||||
int32_t limit = len & ~1; // = 2 * getRangeCount()
|
||||
|
||||
// If the set contains at least 2 intervals and includes both
|
||||
// MIN_VALUE and MAX_VALUE, then the inverse representation will
|
||||
// be more economical.
|
||||
if (count > 1 &&
|
||||
getRangeStart(0) == MIN_VALUE &&
|
||||
getRangeEnd(count-1) == MAX_VALUE) {
|
||||
|
||||
// if (getRangeCount() >= 2 &&
|
||||
// getRangeStart(0) == MIN_VALUE &&
|
||||
// getRangeEnd(last) == MAX_VALUE)
|
||||
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
|
||||
// If limit == len then len is even and the last range ends with MAX_VALUE.
|
||||
if (len >= 4 && list[0] == 0 && limit == len) {
|
||||
// Emit the inverse
|
||||
result.append(u'^');
|
||||
|
||||
for (int32_t i = 1; i < count; ++i) {
|
||||
UChar32 start = getRangeEnd(i-1)+1;
|
||||
UChar32 end = getRangeStart(i)-1;
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end) {
|
||||
result.append(u'-');
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
}
|
||||
}
|
||||
// Offsetting the inversion list index by one lets us
|
||||
// iterate over the ranges of the set complement.
|
||||
i = 1;
|
||||
--limit;
|
||||
}
|
||||
|
||||
// Default; emit the ranges as pairs
|
||||
else {
|
||||
for (int32_t i = 0; i < count; ++i) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
UChar32 end = getRangeEnd(i);
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end) {
|
||||
result.append(u'-');
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
// Emit the ranges as pairs.
|
||||
while (i < limit) {
|
||||
UChar32 start = list[i]; // getRangeStart()
|
||||
UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
|
||||
if (!(0xd800 <= end && end <= 0xdbff)) {
|
||||
_appendToPat(result, start, end, escapeUnprintable);
|
||||
i += 2;
|
||||
} else {
|
||||
// The range ends with a lead surrogate.
|
||||
// Avoid writing what looks like a lead+trail surrogate pair.
|
||||
// 1. Postpone ranges that start with a lead surrogate code point.
|
||||
int32_t firstLead = i;
|
||||
while ((i += 2) < limit && list[i] <= 0xdbff) {}
|
||||
int32_t firstAfterLead = i;
|
||||
// 2. Write following ranges that start with a trail surrogate code point.
|
||||
while (i < limit && (start = list[i]) <= 0xdfff) {
|
||||
_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
|
||||
i += 2;
|
||||
}
|
||||
// 3. Now write the postponed ranges.
|
||||
for (int j = firstLead; j < firstAfterLead; j += 2) {
|
||||
_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -65,38 +65,52 @@ UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the character is NOT printable ASCII.
|
||||
*/
|
||||
UBool ICU_Utility::isUnprintable(UChar32 c) {
|
||||
return !(c >= 0x20 && c <= 0x7E);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape unprintable characters using \uxxxx notation for U+0000 to
|
||||
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
|
||||
* printable ASCII, then do nothing and return FALSE. Otherwise,
|
||||
* append the escaped notation and return TRUE.
|
||||
*/
|
||||
UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) {
|
||||
if (c < 0x20) {
|
||||
return true; // C0 control codes
|
||||
} else if (c <= 0x7e) {
|
||||
return false; // printable ASCII
|
||||
} else if (c <= 0x9f) {
|
||||
return true; // C1 control codes
|
||||
} else if (c < 0xd800) {
|
||||
return false; // most of the BMP
|
||||
} else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
|
||||
return true; // surrogate or noncharacter code points
|
||||
} else if (c <= 0x10ffff) {
|
||||
return false; // all else
|
||||
} else {
|
||||
return true; // not a code point
|
||||
}
|
||||
}
|
||||
|
||||
UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
|
||||
if (isUnprintable(c)) {
|
||||
result.append(BACKSLASH);
|
||||
if (c & ~0xFFFF) {
|
||||
result.append(UPPER_U);
|
||||
result.append(DIGITS[0xF&(c>>28)]);
|
||||
result.append(DIGITS[0xF&(c>>24)]);
|
||||
result.append(DIGITS[0xF&(c>>20)]);
|
||||
result.append(DIGITS[0xF&(c>>16)]);
|
||||
} else {
|
||||
result.append(LOWER_U);
|
||||
}
|
||||
result.append(DIGITS[0xF&(c>>12)]);
|
||||
result.append(DIGITS[0xF&(c>>8)]);
|
||||
result.append(DIGITS[0xF&(c>>4)]);
|
||||
result.append(DIGITS[0xF&c]);
|
||||
return TRUE;
|
||||
escape(result, c);
|
||||
return true;
|
||||
}
|
||||
return FALSE;
|
||||
return false;
|
||||
}
|
||||
|
||||
UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) {
|
||||
result.append(BACKSLASH);
|
||||
if (c & ~0xFFFF) {
|
||||
result.append(UPPER_U);
|
||||
result.append(DIGITS[0xF&(c>>28)]);
|
||||
result.append(DIGITS[0xF&(c>>24)]);
|
||||
result.append(DIGITS[0xF&(c>>20)]);
|
||||
result.append(DIGITS[0xF&(c>>16)]);
|
||||
} else {
|
||||
result.append(LOWER_U);
|
||||
}
|
||||
result.append(DIGITS[0xF&(c>>12)]);
|
||||
result.append(DIGITS[0xF&(c>>8)]);
|
||||
result.append(DIGITS[0xF&(c>>4)]);
|
||||
result.append(DIGITS[0xF&c]);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -55,20 +55,30 @@ class U_COMMON_API ICU_Utility /* not : public UObject because all methods are s
|
|||
|
||||
/**
|
||||
* Return true if the character is NOT printable ASCII.
|
||||
*
|
||||
* This method should really be in UnicodeString (or similar). For
|
||||
* now, we implement it here and share it with friend classes.
|
||||
* The tab, newline and linefeed characters are considered unprintable.
|
||||
*/
|
||||
static UBool isUnprintable(UChar32 c);
|
||||
|
||||
/**
|
||||
* Escape unprintable characters using \uxxxx notation for U+0000 to
|
||||
* @return true for control codes and for surrogate and noncharacter code points
|
||||
*/
|
||||
static UBool shouldAlwaysBeEscaped(UChar32 c);
|
||||
|
||||
/**
|
||||
* Escapes one unprintable code point using \uxxxx notation for U+0000 to
|
||||
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
|
||||
* printable ASCII, then do nothing and return false. Otherwise,
|
||||
* append the escaped notation and return true.
|
||||
*/
|
||||
static UBool escapeUnprintable(UnicodeString& result, UChar32 c);
|
||||
|
||||
/**
|
||||
* Escapes one code point using \uxxxx notation
|
||||
* for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above.
|
||||
* @return result
|
||||
*/
|
||||
static UnicodeString &escape(UnicodeString& result, UChar32 c);
|
||||
|
||||
/**
|
||||
* Returns the index of a character, ignoring quoted text.
|
||||
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
||||
|
|
|
@ -94,7 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestFreezable);
|
||||
TESTCASE_AUTO(TestSpan);
|
||||
TESTCASE_AUTO(TestStringSpan);
|
||||
TESTCASE_AUTO(TestUCAUnsafeBackwards);
|
||||
TESTCASE_AUTO(TestPatternWithSurrogates);
|
||||
TESTCASE_AUTO(TestIntOverflow);
|
||||
TESTCASE_AUTO(TestUnusedCcc);
|
||||
TESTCASE_AUTO(TestDeepPattern);
|
||||
|
@ -3908,60 +3908,47 @@ void UnicodeSetTest::TestStringSpan() {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Including collationroot.h fails here with
|
||||
1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
|
||||
* .. so, we skip this test on Windows.
|
||||
*
|
||||
* the cause is that intltest builds with /Za which disables language extensions - which means
|
||||
* windows header files can't be used.
|
||||
*/
|
||||
#if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
|
||||
#include "collationroot.h"
|
||||
#include "collationtailoring.h"
|
||||
#endif
|
||||
void UnicodeSetTest::TestPatternWithSurrogates() {
|
||||
IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
|
||||
// Regression test for ICU-11891
|
||||
UnicodeSet surrogates;
|
||||
surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
|
||||
surrogates.add(0xd83a); // a lead surrogate
|
||||
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
|
||||
UnicodeString pat;
|
||||
surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00
|
||||
UnicodeSet s2;
|
||||
// was: U_MALFORMED_SET
|
||||
// Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
|
||||
s2.applyPattern(pat, errorCode);
|
||||
if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
|
||||
checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
|
||||
|
||||
void UnicodeSetTest::TestUCAUnsafeBackwards() {
|
||||
#if U_PLATFORM_HAS_WIN32_API
|
||||
infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
|
||||
#elif !UCONFIG_NO_COLLATION
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
// create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
|
||||
surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
|
||||
// add a beyond-surrogates range, up to the last code point
|
||||
surrogates.add(0x10affe, 0x10ffff);
|
||||
surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00
|
||||
s2.applyPattern(pat, errorCode);
|
||||
if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
|
||||
checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
|
||||
|
||||
// Get the unsafeBackwardsSet
|
||||
const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
//const UVersionInfo &version = rootEntry->tailoring->version;
|
||||
const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
|
||||
// Test the toPattern() code path when the pattern is shorter in complement form:
|
||||
// [^opposite-ranges]
|
||||
surrogates.add(0, 0x6789);
|
||||
surrogates.toPattern(pat, false);
|
||||
s2.applyPattern(pat, errorCode);
|
||||
if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
|
||||
checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
|
||||
|
||||
checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
|
||||
|
||||
if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
|
||||
// simple test case
|
||||
// TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
|
||||
// TODO(ticket #11891): Port test to Java. Is this a bug there, too?
|
||||
UnicodeSet surrogates;
|
||||
surrogates.add(0xd83a); // a lead surrogate
|
||||
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
|
||||
UnicodeString pat;
|
||||
surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
|
||||
// TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
|
||||
// so that at least one type of surrogate code points are escaped,
|
||||
// or (minimally) so that adjacent lead+trail surrogate code points are escaped.
|
||||
errorCode = U_ZERO_ERROR;
|
||||
UnicodeSet s2;
|
||||
s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
|
||||
} else {
|
||||
checkEqual(surrogates, s2, "surrogates to/from pattern");
|
||||
}
|
||||
// This occurs in the UCA unsafe-backwards set.
|
||||
checkRoundTrip(*unsafeBackwardSet);
|
||||
}
|
||||
#endif
|
||||
// Start with a pattern, in case the original pattern is kept but
|
||||
// without the extra white space.
|
||||
surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
|
||||
if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
|
||||
surrogates.toPattern(pat, false);
|
||||
s2.applyPattern(pat, errorCode);
|
||||
if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
|
||||
checkEqual(surrogates, s2, "surrogates from/to/from pattern");
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestIntOverflow() {
|
||||
|
|
|
@ -91,7 +91,7 @@ private:
|
|||
|
||||
void TestStringSpan();
|
||||
|
||||
void TestUCAUnsafeBackwards();
|
||||
void TestPatternWithSurrogates();
|
||||
void TestIntOverflow();
|
||||
void TestUnusedCcc();
|
||||
void TestDeepPattern();
|
||||
|
|
|
@ -1536,34 +1536,65 @@ public final class Utility {
|
|||
}
|
||||
|
||||
/**
|
||||
* Escape unprintable characters using <backslash>uxxxx notation
|
||||
* @return true for control codes and for surrogate and noncharacter code points
|
||||
*/
|
||||
public static boolean shouldAlwaysBeEscaped(int c) {
|
||||
if (c < 0x20) {
|
||||
return true; // C0 control codes
|
||||
} else if (c <= 0x7e) {
|
||||
return false; // printable ASCII
|
||||
} else if (c <= 0x9f) {
|
||||
return true; // C1 control codes
|
||||
} else if (c < 0xd800) {
|
||||
return false; // most of the BMP
|
||||
} else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
|
||||
return true; // surrogate or noncharacter code points
|
||||
} else if (c <= 0x10ffff) {
|
||||
return false; // all else
|
||||
} else {
|
||||
return true; // not a code point
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes one unprintable code point using <backslash>uxxxx notation
|
||||
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
|
||||
* above. If the character is printable ASCII, then do nothing
|
||||
* and return FALSE. Otherwise, append the escaped notation and
|
||||
* return TRUE.
|
||||
*/
|
||||
public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
|
||||
if (isUnprintable(c)) {
|
||||
escape(result, c);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes one code point using <backslash>uxxxx notation
|
||||
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above.
|
||||
* @return result
|
||||
*/
|
||||
public static <T extends Appendable> T escape(T result, int c) {
|
||||
try {
|
||||
if (isUnprintable(c)) {
|
||||
result.append('\\');
|
||||
if ((c & ~0xFFFF) != 0) {
|
||||
result.append('U');
|
||||
result.append(DIGITS[0xF&(c>>28)]);
|
||||
result.append(DIGITS[0xF&(c>>24)]);
|
||||
result.append(DIGITS[0xF&(c>>20)]);
|
||||
result.append(DIGITS[0xF&(c>>16)]);
|
||||
} else {
|
||||
result.append('u');
|
||||
}
|
||||
result.append(DIGITS[0xF&(c>>12)]);
|
||||
result.append(DIGITS[0xF&(c>>8)]);
|
||||
result.append(DIGITS[0xF&(c>>4)]);
|
||||
result.append(DIGITS[0xF&c]);
|
||||
return true;
|
||||
result.append('\\');
|
||||
if ((c & ~0xFFFF) != 0) {
|
||||
result.append('U');
|
||||
result.append(DIGITS[0xF&(c>>28)]);
|
||||
result.append(DIGITS[0xF&(c>>24)]);
|
||||
result.append(DIGITS[0xF&(c>>20)]);
|
||||
result.append(DIGITS[0xF&(c>>16)]);
|
||||
} else {
|
||||
result.append('u');
|
||||
}
|
||||
return false;
|
||||
result.append(DIGITS[0xF&(c>>12)]);
|
||||
result.append(DIGITS[0xF&(c>>8)]);
|
||||
result.append(DIGITS[0xF&(c>>4)]);
|
||||
result.append(DIGITS[0xF&c]);
|
||||
return result;
|
||||
} catch (IOException e) {
|
||||
throw new IllegalIcuArgumentException(e);
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -650,12 +650,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
*/
|
||||
private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
|
||||
try {
|
||||
if (escapeUnprintable && Utility.isUnprintable(c)) {
|
||||
if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) {
|
||||
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
|
||||
// unprintable
|
||||
if (Utility.escapeUnprintable(buf, c)) {
|
||||
return buf;
|
||||
}
|
||||
return Utility.escape(buf, c);
|
||||
}
|
||||
// Okay to let ':' pass through
|
||||
switch (c) {
|
||||
|
@ -685,6 +683,24 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
}
|
||||
}
|
||||
|
||||
private static <T extends Appendable> T _appendToPat(
|
||||
T result, int start, int end, boolean escapeUnprintable) {
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end ||
|
||||
// Avoid writing what looks like a lead+trail surrogate pair.
|
||||
start == 0xdbff) {
|
||||
try {
|
||||
result.append('-');
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string representation of this set. If the result of
|
||||
* calling this function is passed to a UnicodeSet constructor, it
|
||||
|
@ -712,6 +728,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
}
|
||||
try {
|
||||
if (!escapeUnprintable) {
|
||||
// TODO: The C++ version does not have this shortcut, and instead
|
||||
// always cleans up the pattern string,
|
||||
// which also escapes Utility.shouldAlwaysBeEscaped(c).
|
||||
// We should sync these implementations.
|
||||
result.append(pat);
|
||||
return result;
|
||||
}
|
||||
|
@ -723,7 +743,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// If the unprintable character is preceded by an odd
|
||||
// number of backslashes, then it has been escaped
|
||||
// and we omit the last backslash.
|
||||
Utility.escapeUnprintable(result, c);
|
||||
Utility.escape(result, c);
|
||||
oddNumberOfBackslashes = false;
|
||||
} else if (!oddNumberOfBackslashes && c == '\\') {
|
||||
// Temporarily withhold an odd-numbered backslash.
|
||||
|
@ -749,6 +769,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* Generate and append a string representation of this set to result.
|
||||
* This does not use this.pat, the cleaned up copy of the string
|
||||
* passed to applyPattern().
|
||||
*
|
||||
* @param result the buffer into which to generate the pattern
|
||||
* @param escapeUnprintable escape unprintable characters if true
|
||||
* @stable ICU 2.0
|
||||
|
@ -761,6 +782,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* Generate and append a string representation of this set to result.
|
||||
* This does not use this.pat, the cleaned up copy of the string
|
||||
* passed to applyPattern().
|
||||
*
|
||||
* @param result the buffer into which to generate the pattern
|
||||
* @param escapeUnprintable escape unprintable characters if true
|
||||
* @param includeStrings if false, doesn't include the strings.
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
|
@ -769,47 +793,55 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
return appendNewPattern(result, escapeUnprintable, includeStrings);
|
||||
}
|
||||
|
||||
// Implementation of public _generatePattern().
|
||||
// Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
|
||||
private <T extends Appendable> T appendNewPattern(
|
||||
T result, boolean escapeUnprintable, boolean includeStrings) {
|
||||
try {
|
||||
result.append('[');
|
||||
|
||||
int count = getRangeCount();
|
||||
int i = 0;
|
||||
int limit = len & ~1; // = 2 * getRangeCount()
|
||||
|
||||
// If the set contains at least 2 intervals and includes both
|
||||
// MIN_VALUE and MAX_VALUE, then the inverse representation will
|
||||
// be more economical.
|
||||
if (count > 1 &&
|
||||
getRangeStart(0) == MIN_VALUE &&
|
||||
getRangeEnd(count-1) == MAX_VALUE) {
|
||||
|
||||
// if (getRangeCount() >= 2 &&
|
||||
// getRangeStart(0) == MIN_VALUE &&
|
||||
// getRangeEnd(last) == MAX_VALUE)
|
||||
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
|
||||
// If limit == len then len is even and the last range ends with MAX_VALUE.
|
||||
if (len >= 4 && list[0] == 0 && limit == len) {
|
||||
// Emit the inverse
|
||||
result.append('^');
|
||||
|
||||
for (int i = 1; i < count; ++i) {
|
||||
int start = getRangeEnd(i-1)+1;
|
||||
int end = getRangeStart(i)-1;
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end) {
|
||||
result.append('-');
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
}
|
||||
}
|
||||
// Offsetting the inversion list index by one lets us
|
||||
// iterate over the ranges of the set complement.
|
||||
i = 1;
|
||||
--limit;
|
||||
}
|
||||
|
||||
// Default; emit the ranges as pairs
|
||||
else {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int start = getRangeStart(i);
|
||||
int end = getRangeEnd(i);
|
||||
_appendToPat(result, start, escapeUnprintable);
|
||||
if (start != end) {
|
||||
if ((start+1) != end) {
|
||||
result.append('-');
|
||||
}
|
||||
_appendToPat(result, end, escapeUnprintable);
|
||||
// Emit the ranges as pairs.
|
||||
while (i < limit) {
|
||||
int start = list[i]; // getRangeStart()
|
||||
int end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
|
||||
if (!(0xd800 <= end && end <= 0xdbff)) {
|
||||
_appendToPat(result, start, end, escapeUnprintable);
|
||||
i += 2;
|
||||
} else {
|
||||
// The range ends with a lead surrogate.
|
||||
// Avoid writing what looks like a lead+trail surrogate pair.
|
||||
// 1. Postpone ranges that start with a lead surrogate code point.
|
||||
int firstLead = i;
|
||||
while ((i += 2) < limit && list[i] <= 0xdbff) {}
|
||||
int firstAfterLead = i;
|
||||
// 2. Write following ranges that start with a trail surrogate code point.
|
||||
while (i < limit && (start = list[i]) <= 0xdfff) {
|
||||
_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
|
||||
i += 2;
|
||||
}
|
||||
// 3. Now write the postponed ranges.
|
||||
for (int j = firstLead; j < firstAfterLead; j += 2) {
|
||||
_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2752,6 +2752,42 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestPatternWithSurrogates() {
|
||||
// Regression test for ICU-11891
|
||||
UnicodeSet surrogates = new UnicodeSet();
|
||||
surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
|
||||
surrogates.add(0xd83a); // a lead surrogate
|
||||
surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
|
||||
String pat = surrogates.toPattern(false); // bad if U+D83A is immediately followed by U+DC00
|
||||
UnicodeSet s2 = new UnicodeSet();
|
||||
// was: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
|
||||
s2.applyPattern(pat);
|
||||
checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
|
||||
|
||||
// create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
|
||||
surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
|
||||
// add a beyond-surrogates range, up to the last code point
|
||||
surrogates.add(0x10affe, 0x10ffff);
|
||||
pat = surrogates.toPattern(false); // bad if U+DBFF is immediately followed by U+DC00
|
||||
s2.applyPattern(pat);
|
||||
checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
|
||||
|
||||
// Test the toPattern() code path when the pattern is shorter in complement form:
|
||||
// [^opposite-ranges]
|
||||
surrogates.add(0, 0x6789);
|
||||
pat = surrogates.toPattern(false);
|
||||
s2.applyPattern(pat);
|
||||
checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
|
||||
|
||||
// Start with a pattern, in case the original pattern is kept but
|
||||
// without the extra white space.
|
||||
surrogates.applyPattern("[\\uD83A \\uDC00-\\uDFFF]");
|
||||
pat = surrogates.toPattern(false);
|
||||
s2.applyPattern(pat);
|
||||
checkEqual(surrogates, s2, "surrogates from/to/from pattern");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestUnusedCcc() {
|
||||
// All numeric ccc values 0..255 are valid, but many are unused.
|
||||
|
|
|
@ -79,8 +79,13 @@ public class RegexUtilitiesTest extends TestFmwk {
|
|||
errln(e.getMessage());
|
||||
continue;
|
||||
}
|
||||
final String expected = "[" + s + "]";
|
||||
assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
|
||||
String expected = "[" + s + "]"; // Try this first for faster testing.
|
||||
boolean ok = pattern.equals(expected);
|
||||
if (!ok) {
|
||||
expected = new UnicodeSet(expected).toPattern(false);
|
||||
ok = pattern.equals(expected);
|
||||
}
|
||||
assertTrue("Doubled character works " + hex.transform(s), ok);
|
||||
|
||||
// verify that we can create a regex pattern and use as expected
|
||||
String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
|
||||
|
|
Loading…
Add table
Reference in a new issue