ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits

This commit is contained in:
Markus Scherer 2021-08-24 12:42:40 -07:00
parent ae3f2ae7ca
commit 486e2d36ac
6 changed files with 103 additions and 53 deletions
icu4c/source
common
test/intltest
icu4j/main
classes/core/src/com/ibm/icu
tests/core/src/com/ibm/icu/dev/test/util

View file

@ -217,9 +217,8 @@ class RuleCharacterIterator;
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
* <td valign="top"><em>any character for which
* </em><code>Character.digit(c, 16)</code><em>
* returns a non-negative result</em></td>
* <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
* &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
* </tr>
* <tr>
* <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>

View file

@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = {
enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
static int8_t _digit8(UChar c) {
if (c >= 0x0030 && c <= 0x0037) {
return (int8_t)(c - 0x0030);
static int32_t _digit8(UChar c) {
if (c >= u'0' && c <= u'7') {
return c - u'0';
}
return -1;
}
/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
static int8_t _digit16(UChar c) {
if (c >= 0x0030 && c <= 0x0039) {
return (int8_t)(c - 0x0030);
static int32_t _digit16(UChar c) {
if (c >= u'0' && c <= u'9') {
return c - u'0';
}
if (c >= 0x0041 && c <= 0x0046) {
return (int8_t)(c - (0x0041 - 10));
if (c >= u'A' && c <= u'F') {
return c - (u'A' - 10);
}
if (c >= 0x0061 && c <= 0x0066) {
return (int8_t)(c - (0x0061 - 10));
if (c >= u'a' && c <= u'f') {
return c - (u'a' - 10);
}
return -1;
}
@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
void *context) {
int32_t start = *offset;
UChar c;
UChar32 c;
UChar32 result = 0;
int8_t n = 0;
int8_t minDig = 0;
int8_t maxDig = 0;
int8_t bitsPerDigit = 4;
int8_t dig;
int32_t i;
int32_t dig;
UBool braces = FALSE;
/* Check that offset is in range */
@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
/* Convert hexadecimal and octal escapes */
switch (c) {
case 0x0075 /*'u'*/:
case u'u':
minDig = maxDig = 4;
break;
case 0x0055 /*'U'*/:
case u'U':
minDig = maxDig = 8;
break;
case 0x0078 /*'x'*/:
case u'x':
minDig = 1;
if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
if (*offset < length && charAt(*offset, context) == u'{') {
++(*offset);
braces = TRUE;
maxDig = 8;
@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
if (minDig != 0) {
while (*offset < length && n < maxDig) {
c = charAt(*offset, context);
dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
if (dig < 0) {
break;
}
@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
goto err;
}
if (braces) {
if (c != 0x7D /*}*/) {
if (c != u'}') {
goto err;
}
++(*offset);
@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
if (*offset < length && U16_IS_LEAD(result)) {
int32_t ahead = *offset + 1;
c = charAt(*offset, context);
if (c == 0x5C /*'\\'*/ && ahead < length) {
// Calling u_unescapeAt recursively may cause a stack overflow if
// we have repeated surrogate lead after that. Limit the
// length to 5 ('u' and 4 hex) after ahead.
int32_t tailLimit = ahead + 5;
if (c == u'\\' && ahead < length) {
// Calling ourselves recursively may cause a stack overflow if
// we have repeated escaped lead surrogates.
// Limit the length to 11 ("x{0000DFFF}") after ahead.
int32_t tailLimit = ahead + 11;
if (tailLimit > length) {
tailLimit = length;
}
c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit,
context);
c = u_unescapeAt(charAt, &ahead, tailLimit, context);
}
if (U16_IS_TRAIL(c)) {
*offset = ahead;
@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
if (c == UNESCAPE_MAP[i]) {
return UNESCAPE_MAP[i+1];
} else if (c < UNESCAPE_MAP[i]) {
@ -1322,13 +1320,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
}
/* Map \cX to control-X: X & 0x1F */
if (c == 0x0063 /*'c'*/ && *offset < length) {
if (c == u'c' && *offset < length) {
c = charAt((*offset)++, context);
if (U16_IS_LEAD(c) && *offset < length) {
UChar c2 = charAt(*offset, context);
if (U16_IS_TRAIL(c2)) {
++(*offset);
c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
c = U16_GET_SUPPLEMENTARY(c, c2);
}
}
return 0x1F & c;

View file

@ -1345,6 +1345,12 @@ void UnicodeStringTest::TestUnescape(void) {
if (!UNICODE_STRING("wrong \\u sequence", 17).unescape().isEmpty()) {
errln("FAIL: unescaping of a string with an illegal escape sequence did not return an empty string");
}
// ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
UnicodeString euro = UnicodeString(u"\\u20aC").unescape();
assertEquals("ASCII Euro", u"", euro);
UnicodeString nonASCIIEuro = UnicodeString(u"\\u").unescape();
assertTrue("unescape() accepted non-ASCII digits", nonASCIIEuro.isEmpty());
}
/* test code point counting functions --------------------------------------- */

View file

@ -777,6 +777,28 @@ public final class Utility {
/*v*/ 0x76, 0x0b
};
/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
private static final int _digit8(int c) {
if (c >= '0' && c <= '7') {
return c - '0';
}
return -1;
}
/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
private static final int _digit16(int c) {
if (c >= '0' && c <= '9') {
return c - '0';
}
if (c >= 'A' && c <= 'F') {
return c - ('A' - 10);
}
if (c >= 'a' && c <= 'f') {
return c - ('a' - 10);
}
return -1;
}
/**
* Converts an escape to a code point value. We attempt
* to parallel the icu4c unescapeAt() function.
@ -788,26 +810,26 @@ public final class Utility {
* @return the code point and length, or -1 on error.
*/
public static int unescapeAndLengthAt(CharSequence s, int offset) {
int c;
return unescapeAndLengthAt(s, offset, s.length());
}
private static int unescapeAndLengthAt(CharSequence s, int offset, int length) {
int result = 0;
int n = 0;
int minDig = 0;
int maxDig = 0;
int bitsPerDigit = 4;
int dig;
int i;
boolean braces = false;
/* Check that offset is in range */
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
int start = offset;
/* Fetch first UChar after '\\' */
c = Character.codePointAt(s, offset);
offset += UTF16.getCharCount(c);
int c = s.charAt(offset++);
/* Convert hexadecimal and octal escapes */
switch (c) {
@ -819,7 +841,7 @@ public final class Utility {
break;
case 'x':
minDig = 1;
if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
if (offset < length && s.charAt(offset) == '{') {
++offset;
braces = true;
maxDig = 8;
@ -828,7 +850,7 @@ public final class Utility {
}
break;
default:
dig = UCharacter.digit(c, 8);
dig = _digit8(c);
if (dig >= 0) {
minDig = 1;
maxDig = 3;
@ -840,20 +862,20 @@ public final class Utility {
}
if (minDig != 0) {
while (offset < length && n < maxDig) {
c = UTF16.charAt(s, offset);
dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
c = s.charAt(offset);
dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
if (dig < 0) {
break;
}
result = (result << bitsPerDigit) | dig;
offset += UTF16.getCharCount(c);
++offset;
++n;
}
if (n < minDig) {
return -1;
}
if (braces) {
if (c != 0x7D /*}*/) {
if (c != '}') {
return -1;
}
++offset;
@ -867,9 +889,16 @@ public final class Utility {
// supplementary.
if (offset < length && UTF16.isLeadSurrogate(result)) {
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
c = s.charAt(offset);
if (c == '\\' && ahead < length) {
int cpAndLength = unescapeAndLengthAt(s, ahead);
// Calling ourselves recursively may cause a stack overflow if
// we have repeated escaped lead surrogates.
// Limit the length to 11 ("x{0000DFFF}") after ahead.
int tailLimit = ahead + 11;
if (tailLimit > length) {
tailLimit = length;
}
int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit);
if (cpAndLength >= 0) {
c = cpAndLength >> 8;
ahead += cpAndLength & 0xff;
@ -877,14 +906,14 @@ public final class Utility {
}
if (UTF16.isTrailSurrogate(c)) {
offset = ahead;
result = Character.toCodePoint((char) result, (char) c);
result = UCharacter.toCodePoint(result, c);
}
}
return codePointAndLength(result, start, offset);
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
} else if (c < UNESCAPE_MAP[i]) {
@ -894,12 +923,20 @@ public final class Utility {
/* Map \cX to control-X: X & 0x1F */
if (c == 'c' && offset < length) {
c = UTF16.charAt(s, offset);
return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
c = Character.codePointAt(s, offset);
return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c));
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
* the backslash to generically escape the next character.
* Deal with surrogate pairs. */
if (UTF16.isLeadSurrogate(c) && offset < length) {
int c2 = s.charAt(offset);
if (UTF16.isTrailSurrogate(c2)) {
++offset;
c = UCharacter.toCodePoint(c, c2);
}
}
return codePointAndLength(c, start, offset);
}

View file

@ -225,9 +225,8 @@ import com.ibm.icu.util.VersionInfo;
* </tr>
* <tr style="vertical-align: top">
* <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
* <td style="vertical-align: top;"><em>any character for which
* </em><code>Character.digit(c, 16)</code><em>
* returns a non-negative result</em></td>
* <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
* &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
* </tr>
* <tr>
* <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>

View file

@ -63,6 +63,17 @@ public class UtilityTest extends TestFmwk {
assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
// ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
String euro = Utility.unescape("\\u20aC");
assertEquals("ASCII Euro", "", euro);
try {
Utility.unescape("\\u");
fail("unescape() accepted non-ASCII digits");
} catch(IllegalArgumentException expected) {
}
String euro2 = Utility.unescapeLeniently("\\u20aC\\u");
assertEquals("lenient", "\\u", euro2);
}
@Test