ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits

2025-04-13 00:43:32 +00:00 · 2021-08-24 12:42:40 -07:00 · 2021-08-24 12:42:40 -07:00 · 486e2d36ac
commit 486e2d36ac
parent ae3f2ae7ca
6 changed files with 103 additions and 53 deletions
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -217,9 +217,8 @@ class RuleCharacterIterator;
 *     </tr>
 *     <tr align="top">
 *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
- *       <td valign="top"><em>any character for which
- *       </em><code>Character.digit(c, 16)</code><em>
- *       returns a non-negative result</em></td>
+ *       <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
+ *       &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
 *     </tr>
 *     <tr>
 *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
--- a/icu4c/source/common/ustring.cpp
+++ b/icu4c/source/common/ustring.cpp
@@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = {
 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };

 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
-static int8_t _digit8(UChar c) {
-    if (c >= 0x0030 && c <= 0x0037) {
-        return (int8_t)(c - 0x0030);
+static int32_t _digit8(UChar c) {
+    if (c >= u'0' && c <= u'7') {
+        return c - u'0';
    }
    return -1;
 }

 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
-static int8_t _digit16(UChar c) {
-    if (c >= 0x0030 && c <= 0x0039) {
-        return (int8_t)(c - 0x0030);
+static int32_t _digit16(UChar c) {
+    if (c >= u'0' && c <= u'9') {
+        return c - u'0';
    }
-    if (c >= 0x0041 && c <= 0x0046) {
-        return (int8_t)(c - (0x0041 - 10));
+    if (c >= u'A' && c <= u'F') {
+        return c - (u'A' - 10);
    }
-    if (c >= 0x0061 && c <= 0x0066) {
-        return (int8_t)(c - (0x0061 - 10));
+    if (c >= u'a' && c <= u'f') {
+        return c - (u'a' - 10);
    }
    return -1;
 }
@@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             void *context) {

    int32_t start = *offset;
-    UChar c;
+    UChar32 c;
    UChar32 result = 0;
    int8_t n = 0;
    int8_t minDig = 0;
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4; 
-    int8_t dig;
-    int32_t i;
+    int32_t dig;
    UBool braces = FALSE;

    /* Check that offset is in range */
@@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,

    /* Convert hexadecimal and octal escapes */
    switch (c) {
-    case 0x0075 /*'u'*/:
+    case u'u':
        minDig = maxDig = 4;
        break;
-    case 0x0055 /*'U'*/:
+    case u'U':
        minDig = maxDig = 8;
        break;
-    case 0x0078 /*'x'*/:
+    case u'x':
        minDig = 1;
-        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
+        if (*offset < length && charAt(*offset, context) == u'{') {
            ++(*offset);
            braces = TRUE;
            maxDig = 8;
@@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
    if (minDig != 0) {
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
-            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
+            dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
            if (dig < 0) {
                break;
            }
@@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
            goto err;
        }
        if (braces) {
-            if (c != 0x7D /*}*/) {
+            if (c != u'}') {
                goto err;
            }
            ++(*offset);
@@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            c = charAt(*offset, context);
-            if (c == 0x5C /*'\\'*/ && ahead < length) {
-                // Calling u_unescapeAt recursively may cause a stack overflow if
-                // we have repeated surrogate lead after that. Limit the
-                // length to 5 ('u' and 4 hex) after ahead.
-                int32_t tailLimit = ahead + 5;
+            if (c == u'\\' && ahead < length) {
+                // Calling ourselves recursively may cause a stack overflow if
+                // we have repeated escaped lead surrogates.
+                // Limit the length to 11 ("x{0000DFFF}") after ahead.
+                int32_t tailLimit = ahead + 11;
                if (tailLimit > length) {
                    tailLimit = length;
                }
-                c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit,
-                                         context);
+                c = u_unescapeAt(charAt, &ahead, tailLimit, context);
            }
            if (U16_IS_TRAIL(c)) {
                *offset = ahead;
@@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
    }

    /* Convert C-style escapes in table */
-    for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
+    for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i+1];
        } else if (c < UNESCAPE_MAP[i]) {
@@ -1322,13 +1320,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
    }

    /* Map \cX to control-X: X & 0x1F */
-    if (c == 0x0063 /*'c'*/ && *offset < length) {
+    if (c == u'c' && *offset < length) {
        c = charAt((*offset)++, context);
        if (U16_IS_LEAD(c) && *offset < length) {
            UChar c2 = charAt(*offset, context);
            if (U16_IS_TRAIL(c2)) {
                ++(*offset);
-                c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
+                c = U16_GET_SUPPLEMENTARY(c, c2);
            }
        }
        return 0x1F & c;
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -1345,6 +1345,12 @@ void UnicodeStringTest::TestUnescape(void) {
    if (!UNICODE_STRING("wrong \\u sequence", 17).unescape().isEmpty()) {
        errln("FAIL: unescaping of a string with an illegal escape sequence did not return an empty string");
    }
+
+    // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
+    UnicodeString euro = UnicodeString(u"\\u20aC").unescape();
+    assertEquals("ASCII Euro", u"€", euro);
+    UnicodeString nonASCIIEuro = UnicodeString(u"\\u୨෦ａＣ").unescape();
+    assertTrue("unescape() accepted non-ASCII digits", nonASCIIEuro.isEmpty());
 }

 /* test code point counting functions --------------------------------------- */
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -777,6 +777,28 @@ public final class Utility {
        /*v*/ 0x76, 0x0b
    };

+    /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
+    private static final int _digit8(int c) {
+        if (c >= '0' && c <= '7') {
+            return c - '0';
+        }
+        return -1;
+    }
+
+    /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
+    private static final int _digit16(int c) {
+        if (c >= '0' && c <= '9') {
+            return c - '0';
+        }
+        if (c >= 'A' && c <= 'F') {
+            return c - ('A' - 10);
+        }
+        if (c >= 'a' && c <= 'f') {
+            return c - ('a' - 10);
+        }
+        return -1;
+    }
+
    /**
     * Converts an escape to a code point value. We attempt
     * to parallel the icu4c unescapeAt() function.
@@ -788,26 +810,26 @@ public final class Utility {
     * @return the code point and length, or -1 on error.
     */
    public static int unescapeAndLengthAt(CharSequence s, int offset) {
-        int c;
+        return unescapeAndLengthAt(s, offset, s.length());
+    }
+
+    private static int unescapeAndLengthAt(CharSequence s, int offset, int length) {
        int result = 0;
        int n = 0;
        int minDig = 0;
        int maxDig = 0;
        int bitsPerDigit = 4;
        int dig;
-        int i;
        boolean braces = false;

        /* Check that offset is in range */
-        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }
        int start = offset;

        /* Fetch first UChar after '\\' */
-        c = Character.codePointAt(s, offset);
-        offset += UTF16.getCharCount(c);
+        int c = s.charAt(offset++);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
@@ -819,7 +841,7 @@ public final class Utility {
            break;
        case 'x':
            minDig = 1;
-            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
+            if (offset < length && s.charAt(offset) == '{') {
                ++offset;
                braces = true;
                maxDig = 8;
@@ -828,7 +850,7 @@ public final class Utility {
            }
            break;
        default:
-            dig = UCharacter.digit(c, 8);
+            dig = _digit8(c);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
@@ -840,20 +862,20 @@ public final class Utility {
        }
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
-                c = UTF16.charAt(s, offset);
-                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
+                c = s.charAt(offset);
+                dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
-                offset += UTF16.getCharCount(c);
+                ++offset;
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
-                if (c != 0x7D /*}*/) {
+                if (c != '}') {
                    return -1;
                }
                ++offset;
@@ -867,9 +889,16 @@ public final class Utility {
            // supplementary.
            if (offset < length && UTF16.isLeadSurrogate(result)) {
                int ahead = offset+1;
-                c = s.charAt(offset); // [sic] get 16-bit code unit
+                c = s.charAt(offset);
                if (c == '\\' && ahead < length) {
-                    int cpAndLength = unescapeAndLengthAt(s, ahead);
+                    // Calling ourselves recursively may cause a stack overflow if
+                    // we have repeated escaped lead surrogates.
+                    // Limit the length to 11 ("x{0000DFFF}") after ahead.
+                    int tailLimit = ahead + 11;
+                    if (tailLimit > length) {
+                        tailLimit = length;
+                    }
+                    int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit);
                    if (cpAndLength >= 0) {
                        c = cpAndLength >> 8;
                        ahead += cpAndLength & 0xff;
@@ -877,14 +906,14 @@ public final class Utility {
                }
                if (UTF16.isTrailSurrogate(c)) {
                    offset = ahead;
-                    result = Character.toCodePoint((char) result, (char) c);
+                    result = UCharacter.toCodePoint(result, c);
                }
            }
            return codePointAndLength(result, start, offset);
        }

        /* Convert C-style escapes in table */
-        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
+        for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
                return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
            } else if (c < UNESCAPE_MAP[i]) {
@@ -894,12 +923,20 @@ public final class Utility {

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
-            c = UTF16.charAt(s, offset);
-            return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
+            c = Character.codePointAt(s, offset);
+            return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c));
        }

        /* If no special forms are recognized, then consider
-         * the backslash to generically escape the next character. */
+         * the backslash to generically escape the next character.
+         * Deal with surrogate pairs. */
+        if (UTF16.isLeadSurrogate(c) && offset < length) {
+            int c2 = s.charAt(offset);
+            if (UTF16.isTrailSurrogate(c2)) {
+                ++offset;
+                c = UCharacter.toCodePoint(c, c2);
+            }
+        }
        return codePointAndLength(c, start, offset);
    }

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -225,9 +225,8 @@ import com.ibm.icu.util.VersionInfo;
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
- *       <td style="vertical-align: top;"><em>any character for which
- *       </em><code>Character.digit(c, 16)</code><em>
- *       returns a non-negative result</em></td>
+ *       <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
+ *       &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
 *     </tr>
 *     <tr>
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -63,6 +63,17 @@ public class UtilityTest extends TestFmwk {
        assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
        assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
        assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
+
+        // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
+        String euro = Utility.unescape("\\u20aC");
+        assertEquals("ASCII Euro", "€", euro);
+        try {
+            Utility.unescape("\\u୨෦ａＣ");
+            fail("unescape() accepted non-ASCII digits");
+        } catch(IllegalArgumentException expected) {
+        }
+        String euro2 = Utility.unescapeLeniently("\\u20aC\\u୨෦ａＣ");
+        assertEquals("lenient", "€\\u୨෦ａＣ", euro2);
    }

    @Test