From 486e2d36ac49cddd87d81d77db4e87d0ea4bfae4 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 24 Aug 2021 12:42:40 -0700 Subject: [PATCH] ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits --- icu4c/source/common/unicode/uniset.h | 5 +- icu4c/source/common/ustring.cpp | 56 +++++++------- icu4c/source/test/intltest/ustrtest.cpp | 6 ++ .../core/src/com/ibm/icu/impl/Utility.java | 73 ++++++++++++++----- .../core/src/com/ibm/icu/text/UnicodeSet.java | 5 +- .../ibm/icu/dev/test/util/UtilityTest.java | 11 +++ 6 files changed, 103 insertions(+), 53 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index d6fa1807de1..dc3e0b24594 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -217,9 +217,8 @@ class RuleCharacterIterator; * * * hex :=  - * any character for which - * Character.digit(c, 16) - * returns a non-negative result + * '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |
+ *     'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
* * * property :=  diff --git a/icu4c/source/common/ustring.cpp b/icu4c/source/common/ustring.cpp index d7f63ec77fe..84772563891 100644 --- a/icu4c/source/common/ustring.cpp +++ b/icu4c/source/common/ustring.cpp @@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = { enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) }; /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ -static int8_t _digit8(UChar c) { - if (c >= 0x0030 && c <= 0x0037) { - return (int8_t)(c - 0x0030); +static int32_t _digit8(UChar c) { + if (c >= u'0' && c <= u'7') { + return c - u'0'; } return -1; } /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ -static int8_t _digit16(UChar c) { - if (c >= 0x0030 && c <= 0x0039) { - return (int8_t)(c - 0x0030); +static int32_t _digit16(UChar c) { + if (c >= u'0' && c <= u'9') { + return c - u'0'; } - if (c >= 0x0041 && c <= 0x0046) { - return (int8_t)(c - (0x0041 - 10)); + if (c >= u'A' && c <= u'F') { + return c - (u'A' - 10); } - if (c >= 0x0061 && c <= 0x0066) { - return (int8_t)(c - (0x0061 - 10)); + if (c >= u'a' && c <= u'f') { + return c - (u'a' - 10); } return -1; } @@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, void *context) { int32_t start = *offset; - UChar c; + UChar32 c; UChar32 result = 0; int8_t n = 0; int8_t minDig = 0; int8_t maxDig = 0; int8_t bitsPerDigit = 4; - int8_t dig; - int32_t i; + int32_t dig; UBool braces = FALSE; /* Check that offset is in range */ @@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, /* Convert hexadecimal and octal escapes */ switch (c) { - case 0x0075 /*'u'*/: + case u'u': minDig = maxDig = 4; break; - case 0x0055 /*'U'*/: + case u'U': minDig = maxDig = 8; break; - case 0x0078 /*'x'*/: + case u'x': minDig = 1; - if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { + if (*offset < length && charAt(*offset, context) == u'{') { ++(*offset); braces = TRUE; maxDig = 8; @@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, if (minDig != 0) { while (*offset < length && n < maxDig) { c = charAt(*offset, context); - dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); + dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); if (dig < 0) { break; } @@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, goto err; } if (braces) { - if (c != 0x7D /*}*/) { + if (c != u'}') { goto err; } ++(*offset); @@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, if (*offset < length && U16_IS_LEAD(result)) { int32_t ahead = *offset + 1; c = charAt(*offset, context); - if (c == 0x5C /*'\\'*/ && ahead < length) { - // Calling u_unescapeAt recursively may cause a stack overflow if - // we have repeated surrogate lead after that. Limit the - // length to 5 ('u' and 4 hex) after ahead. - int32_t tailLimit = ahead + 5; + if (c == u'\\' && ahead < length) { + // Calling ourselves recursively may cause a stack overflow if + // we have repeated escaped lead surrogates. + // Limit the length to 11 ("x{0000DFFF}") after ahead. + int32_t tailLimit = ahead + 11; if (tailLimit > length) { tailLimit = length; } - c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit, - context); + c = u_unescapeAt(charAt, &ahead, tailLimit, context); } if (U16_IS_TRAIL(c)) { *offset = ahead; @@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, } /* Convert C-style escapes in table */ - for (i=0; i= '0' && c <= '7') { + return c - '0'; + } + return -1; + } + + /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ + private static final int _digit16(int c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'F') { + return c - ('A' - 10); + } + if (c >= 'a' && c <= 'f') { + return c - ('a' - 10); + } + return -1; + } + /** * Converts an escape to a code point value. We attempt * to parallel the icu4c unescapeAt() function. @@ -788,26 +810,26 @@ public final class Utility { * @return the code point and length, or -1 on error. */ public static int unescapeAndLengthAt(CharSequence s, int offset) { - int c; + return unescapeAndLengthAt(s, offset, s.length()); + } + + private static int unescapeAndLengthAt(CharSequence s, int offset, int length) { int result = 0; int n = 0; int minDig = 0; int maxDig = 0; int bitsPerDigit = 4; int dig; - int i; boolean braces = false; /* Check that offset is in range */ - int length = s.length(); if (offset < 0 || offset >= length) { return -1; } int start = offset; /* Fetch first UChar after '\\' */ - c = Character.codePointAt(s, offset); - offset += UTF16.getCharCount(c); + int c = s.charAt(offset++); /* Convert hexadecimal and octal escapes */ switch (c) { @@ -819,7 +841,7 @@ public final class Utility { break; case 'x': minDig = 1; - if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { + if (offset < length && s.charAt(offset) == '{') { ++offset; braces = true; maxDig = 8; @@ -828,7 +850,7 @@ public final class Utility { } break; default: - dig = UCharacter.digit(c, 8); + dig = _digit8(c); if (dig >= 0) { minDig = 1; maxDig = 3; @@ -840,20 +862,20 @@ public final class Utility { } if (minDig != 0) { while (offset < length && n < maxDig) { - c = UTF16.charAt(s, offset); - dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); + c = s.charAt(offset); + dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); if (dig < 0) { break; } result = (result << bitsPerDigit) | dig; - offset += UTF16.getCharCount(c); + ++offset; ++n; } if (n < minDig) { return -1; } if (braces) { - if (c != 0x7D /*}*/) { + if (c != '}') { return -1; } ++offset; @@ -867,9 +889,16 @@ public final class Utility { // supplementary. if (offset < length && UTF16.isLeadSurrogate(result)) { int ahead = offset+1; - c = s.charAt(offset); // [sic] get 16-bit code unit + c = s.charAt(offset); if (c == '\\' && ahead < length) { - int cpAndLength = unescapeAndLengthAt(s, ahead); + // Calling ourselves recursively may cause a stack overflow if + // we have repeated escaped lead surrogates. + // Limit the length to 11 ("x{0000DFFF}") after ahead. + int tailLimit = ahead + 11; + if (tailLimit > length) { + tailLimit = length; + } + int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit); if (cpAndLength >= 0) { c = cpAndLength >> 8; ahead += cpAndLength & 0xff; @@ -877,14 +906,14 @@ public final class Utility { } if (UTF16.isTrailSurrogate(c)) { offset = ahead; - result = Character.toCodePoint((char) result, (char) c); + result = UCharacter.toCodePoint(result, c); } } return codePointAndLength(result, start, offset); } /* Convert C-style escapes in table */ - for (i=0; i * * hex :=  - * any character for which - * Character.digit(c, 16) - * returns a non-negative result + * '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |
+ *     'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
* * * property :=  diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java index 15d1e601049..ddc8b0d9aea 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java @@ -63,6 +63,17 @@ public class UtilityTest extends TestFmwk { assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B)); assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E)); assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F)); + + // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits + String euro = Utility.unescape("\\u20aC"); + assertEquals("ASCII Euro", "€", euro); + try { + Utility.unescape("\\u୨෦aC"); + fail("unescape() accepted non-ASCII digits"); + } catch(IllegalArgumentException expected) { + } + String euro2 = Utility.unescapeLeniently("\\u20aC\\u୨෦aC"); + assertEquals("lenient", "€\\u୨෦aC", euro2); } @Test