mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 00:43:32 +00:00
ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
This commit is contained in:
parent
ae3f2ae7ca
commit
486e2d36ac
6 changed files with 103 additions and 53 deletions
icu4c/source
icu4j/main
classes/core/src/com/ibm/icu
tests/core/src/com/ibm/icu/dev/test/util
|
@ -217,9 +217,8 @@ class RuleCharacterIterator;
|
|||
* </tr>
|
||||
* <tr align="top">
|
||||
* <td nowrap valign="top" align="right"><code>hex := </code></td>
|
||||
* <td valign="top"><em>any character for which
|
||||
* </em><code>Character.digit(c, 16)</code><em>
|
||||
* returns a non-negative result</em></td>
|
||||
* <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
|
||||
* 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td nowrap valign="top" align="right"><code>property := </code></td>
|
||||
|
|
|
@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = {
|
|||
enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
|
||||
|
||||
/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
|
||||
static int8_t _digit8(UChar c) {
|
||||
if (c >= 0x0030 && c <= 0x0037) {
|
||||
return (int8_t)(c - 0x0030);
|
||||
static int32_t _digit8(UChar c) {
|
||||
if (c >= u'0' && c <= u'7') {
|
||||
return c - u'0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
|
||||
static int8_t _digit16(UChar c) {
|
||||
if (c >= 0x0030 && c <= 0x0039) {
|
||||
return (int8_t)(c - 0x0030);
|
||||
static int32_t _digit16(UChar c) {
|
||||
if (c >= u'0' && c <= u'9') {
|
||||
return c - u'0';
|
||||
}
|
||||
if (c >= 0x0041 && c <= 0x0046) {
|
||||
return (int8_t)(c - (0x0041 - 10));
|
||||
if (c >= u'A' && c <= u'F') {
|
||||
return c - (u'A' - 10);
|
||||
}
|
||||
if (c >= 0x0061 && c <= 0x0066) {
|
||||
return (int8_t)(c - (0x0061 - 10));
|
||||
if (c >= u'a' && c <= u'f') {
|
||||
return c - (u'a' - 10);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
void *context) {
|
||||
|
||||
int32_t start = *offset;
|
||||
UChar c;
|
||||
UChar32 c;
|
||||
UChar32 result = 0;
|
||||
int8_t n = 0;
|
||||
int8_t minDig = 0;
|
||||
int8_t maxDig = 0;
|
||||
int8_t bitsPerDigit = 4;
|
||||
int8_t dig;
|
||||
int32_t i;
|
||||
int32_t dig;
|
||||
UBool braces = FALSE;
|
||||
|
||||
/* Check that offset is in range */
|
||||
|
@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 0x0075 /*'u'*/:
|
||||
case u'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 0x0055 /*'U'*/:
|
||||
case u'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 0x0078 /*'x'*/:
|
||||
case u'x':
|
||||
minDig = 1;
|
||||
if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
|
||||
if (*offset < length && charAt(*offset, context) == u'{') {
|
||||
++(*offset);
|
||||
braces = TRUE;
|
||||
maxDig = 8;
|
||||
|
@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
if (minDig != 0) {
|
||||
while (*offset < length && n < maxDig) {
|
||||
c = charAt(*offset, context);
|
||||
dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
|
||||
dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
|
@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
goto err;
|
||||
}
|
||||
if (braces) {
|
||||
if (c != 0x7D /*}*/) {
|
||||
if (c != u'}') {
|
||||
goto err;
|
||||
}
|
||||
++(*offset);
|
||||
|
@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
if (*offset < length && U16_IS_LEAD(result)) {
|
||||
int32_t ahead = *offset + 1;
|
||||
c = charAt(*offset, context);
|
||||
if (c == 0x5C /*'\\'*/ && ahead < length) {
|
||||
// Calling u_unescapeAt recursively may cause a stack overflow if
|
||||
// we have repeated surrogate lead after that. Limit the
|
||||
// length to 5 ('u' and 4 hex) after ahead.
|
||||
int32_t tailLimit = ahead + 5;
|
||||
if (c == u'\\' && ahead < length) {
|
||||
// Calling ourselves recursively may cause a stack overflow if
|
||||
// we have repeated escaped lead surrogates.
|
||||
// Limit the length to 11 ("x{0000DFFF}") after ahead.
|
||||
int32_t tailLimit = ahead + 11;
|
||||
if (tailLimit > length) {
|
||||
tailLimit = length;
|
||||
}
|
||||
c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit,
|
||||
context);
|
||||
c = u_unescapeAt(charAt, &ahead, tailLimit, context);
|
||||
}
|
||||
if (U16_IS_TRAIL(c)) {
|
||||
*offset = ahead;
|
||||
|
@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
|
||||
for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
return UNESCAPE_MAP[i+1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
|
@ -1322,13 +1320,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
|
|||
}
|
||||
|
||||
/* Map \cX to control-X: X & 0x1F */
|
||||
if (c == 0x0063 /*'c'*/ && *offset < length) {
|
||||
if (c == u'c' && *offset < length) {
|
||||
c = charAt((*offset)++, context);
|
||||
if (U16_IS_LEAD(c) && *offset < length) {
|
||||
UChar c2 = charAt(*offset, context);
|
||||
if (U16_IS_TRAIL(c2)) {
|
||||
++(*offset);
|
||||
c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
|
||||
c = U16_GET_SUPPLEMENTARY(c, c2);
|
||||
}
|
||||
}
|
||||
return 0x1F & c;
|
||||
|
|
|
@ -1345,6 +1345,12 @@ void UnicodeStringTest::TestUnescape(void) {
|
|||
if (!UNICODE_STRING("wrong \\u sequence", 17).unescape().isEmpty()) {
|
||||
errln("FAIL: unescaping of a string with an illegal escape sequence did not return an empty string");
|
||||
}
|
||||
|
||||
// ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
|
||||
UnicodeString euro = UnicodeString(u"\\u20aC").unescape();
|
||||
assertEquals("ASCII Euro", u"€", euro);
|
||||
UnicodeString nonASCIIEuro = UnicodeString(u"\\u୨෦aC").unescape();
|
||||
assertTrue("unescape() accepted non-ASCII digits", nonASCIIEuro.isEmpty());
|
||||
}
|
||||
|
||||
/* test code point counting functions --------------------------------------- */
|
||||
|
|
|
@ -777,6 +777,28 @@ public final class Utility {
|
|||
/*v*/ 0x76, 0x0b
|
||||
};
|
||||
|
||||
/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
|
||||
private static final int _digit8(int c) {
|
||||
if (c >= '0' && c <= '7') {
|
||||
return c - '0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
|
||||
private static final int _digit16(int c) {
|
||||
if (c >= '0' && c <= '9') {
|
||||
return c - '0';
|
||||
}
|
||||
if (c >= 'A' && c <= 'F') {
|
||||
return c - ('A' - 10);
|
||||
}
|
||||
if (c >= 'a' && c <= 'f') {
|
||||
return c - ('a' - 10);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an escape to a code point value. We attempt
|
||||
* to parallel the icu4c unescapeAt() function.
|
||||
|
@ -788,26 +810,26 @@ public final class Utility {
|
|||
* @return the code point and length, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAndLengthAt(CharSequence s, int offset) {
|
||||
int c;
|
||||
return unescapeAndLengthAt(s, offset, s.length());
|
||||
}
|
||||
|
||||
private static int unescapeAndLengthAt(CharSequence s, int offset, int length) {
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
boolean braces = false;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
int start = offset;
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = Character.codePointAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
int c = s.charAt(offset++);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
|
@ -819,7 +841,7 @@ public final class Utility {
|
|||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
|
||||
if (offset < length && s.charAt(offset) == '{') {
|
||||
++offset;
|
||||
braces = true;
|
||||
maxDig = 8;
|
||||
|
@ -828,7 +850,7 @@ public final class Utility {
|
|||
}
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
dig = _digit8(c);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
|
@ -840,20 +862,20 @@ public final class Utility {
|
|||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
c = s.charAt(offset);
|
||||
dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
offset += UTF16.getCharCount(c);
|
||||
++offset;
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
if (braces) {
|
||||
if (c != 0x7D /*}*/) {
|
||||
if (c != '}') {
|
||||
return -1;
|
||||
}
|
||||
++offset;
|
||||
|
@ -867,9 +889,16 @@ public final class Utility {
|
|||
// supplementary.
|
||||
if (offset < length && UTF16.isLeadSurrogate(result)) {
|
||||
int ahead = offset+1;
|
||||
c = s.charAt(offset); // [sic] get 16-bit code unit
|
||||
c = s.charAt(offset);
|
||||
if (c == '\\' && ahead < length) {
|
||||
int cpAndLength = unescapeAndLengthAt(s, ahead);
|
||||
// Calling ourselves recursively may cause a stack overflow if
|
||||
// we have repeated escaped lead surrogates.
|
||||
// Limit the length to 11 ("x{0000DFFF}") after ahead.
|
||||
int tailLimit = ahead + 11;
|
||||
if (tailLimit > length) {
|
||||
tailLimit = length;
|
||||
}
|
||||
int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit);
|
||||
if (cpAndLength >= 0) {
|
||||
c = cpAndLength >> 8;
|
||||
ahead += cpAndLength & 0xff;
|
||||
|
@ -877,14 +906,14 @@ public final class Utility {
|
|||
}
|
||||
if (UTF16.isTrailSurrogate(c)) {
|
||||
offset = ahead;
|
||||
result = Character.toCodePoint((char) result, (char) c);
|
||||
result = UCharacter.toCodePoint(result, c);
|
||||
}
|
||||
}
|
||||
return codePointAndLength(result, start, offset);
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
|
@ -894,12 +923,20 @@ public final class Utility {
|
|||
|
||||
/* Map \cX to control-X: X & 0x1F */
|
||||
if (c == 'c' && offset < length) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
|
||||
c = Character.codePointAt(s, offset);
|
||||
return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c));
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
* the backslash to generically escape the next character.
|
||||
* Deal with surrogate pairs. */
|
||||
if (UTF16.isLeadSurrogate(c) && offset < length) {
|
||||
int c2 = s.charAt(offset);
|
||||
if (UTF16.isTrailSurrogate(c2)) {
|
||||
++offset;
|
||||
c = UCharacter.toCodePoint(c, c2);
|
||||
}
|
||||
}
|
||||
return codePointAndLength(c, start, offset);
|
||||
}
|
||||
|
||||
|
|
|
@ -225,9 +225,8 @@ import com.ibm.icu.util.VersionInfo;
|
|||
* </tr>
|
||||
* <tr style="vertical-align: top">
|
||||
* <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td>
|
||||
* <td style="vertical-align: top;"><em>any character for which
|
||||
* </em><code>Character.digit(c, 16)</code><em>
|
||||
* returns a non-negative result</em></td>
|
||||
* <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
|
||||
* 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td>
|
||||
|
|
|
@ -63,6 +63,17 @@ public class UtilityTest extends TestFmwk {
|
|||
assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
|
||||
assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
|
||||
assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
|
||||
|
||||
// ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
|
||||
String euro = Utility.unescape("\\u20aC");
|
||||
assertEquals("ASCII Euro", "€", euro);
|
||||
try {
|
||||
Utility.unescape("\\u୨෦aC");
|
||||
fail("unescape() accepted non-ASCII digits");
|
||||
} catch(IllegalArgumentException expected) {
|
||||
}
|
||||
String euro2 = Utility.unescapeLeniently("\\u20aC\\u୨෦aC");
|
||||
assertEquals("lenient", "€\\u୨෦aC", euro2);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Add table
Reference in a new issue