From 486e2d36ac49cddd87d81d77db4e87d0ea4bfae4 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Tue, 24 Aug 2021 12:42:40 -0700
Subject: [PATCH] ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits

---
 icu4c/source/common/unicode/uniset.h          |  5 +-
 icu4c/source/common/ustring.cpp               | 56 +++++++-------
 icu4c/source/test/intltest/ustrtest.cpp       |  6 ++
 .../core/src/com/ibm/icu/impl/Utility.java    | 73 ++++++++++++++-----
 .../core/src/com/ibm/icu/text/UnicodeSet.java |  5 +-
 .../ibm/icu/dev/test/util/UtilityTest.java    | 11 +++
 6 files changed, 103 insertions(+), 53 deletions(-)
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index d6fa1807de1..dc3e0b24594 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -217,9 +217,8 @@ class RuleCharacterIterator;
  *     </tr>
  *     <tr align="top">
  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
- *       <td valign="top"><em>any character for which
- *       </em><code>Character.digit(c, 16)</code><em>
- *       returns a non-negative result</em></td>
+ *       <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
+ *       &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
  *     </tr>
  *     <tr>
  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
diff --git a/icu4c/source/common/ustring.cpp b/icu4c/source/common/ustring.cpp
index d7f63ec77fe..84772563891 100644
--- a/icu4c/source/common/ustring.cpp
+++ b/icu4c/source/common/ustring.cpp
@@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = {
 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
 
 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
-static int8_t _digit8(UChar c) {
-    if (c >= 0x0030 && c <= 0x0037) {
-        return (int8_t)(c - 0x0030);
+static int32_t _digit8(UChar c) {
+    if (c >= u'0' && c <= u'7') {
+        return c - u'0';
     }
     return -1;
 }
 
 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
-static int8_t _digit16(UChar c) {
-    if (c >= 0x0030 && c <= 0x0039) {
-        return (int8_t)(c - 0x0030);
+static int32_t _digit16(UChar c) {
+    if (c >= u'0' && c <= u'9') {
+        return c - u'0';
     }
-    if (c >= 0x0041 && c <= 0x0046) {
-        return (int8_t)(c - (0x0041 - 10));
+    if (c >= u'A' && c <= u'F') {
+        return c - (u'A' - 10);
     }
-    if (c >= 0x0061 && c <= 0x0066) {
-        return (int8_t)(c - (0x0061 - 10));
+    if (c >= u'a' && c <= u'f') {
+        return c - (u'a' - 10);
     }
     return -1;
 }
@@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
              void *context) {
 
     int32_t start = *offset;
-    UChar c;
+    UChar32 c;
     UChar32 result = 0;
     int8_t n = 0;
     int8_t minDig = 0;
     int8_t maxDig = 0;
     int8_t bitsPerDigit = 4; 
-    int8_t dig;
-    int32_t i;
+    int32_t dig;
     UBool braces = FALSE;
 
     /* Check that offset is in range */
@@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
 
     /* Convert hexadecimal and octal escapes */
     switch (c) {
-    case 0x0075 /*'u'*/:
+    case u'u':
         minDig = maxDig = 4;
         break;
-    case 0x0055 /*'U'*/:
+    case u'U':
         minDig = maxDig = 8;
         break;
-    case 0x0078 /*'x'*/:
+    case u'x':
         minDig = 1;
-        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
+        if (*offset < length && charAt(*offset, context) == u'{') {
             ++(*offset);
             braces = TRUE;
             maxDig = 8;
@@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
     if (minDig != 0) {
         while (*offset < length && n < maxDig) {
             c = charAt(*offset, context);
-            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
+            dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
             if (dig < 0) {
                 break;
             }
@@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             goto err;
         }
         if (braces) {
-            if (c != 0x7D /*}*/) {
+            if (c != u'}') {
                 goto err;
             }
             ++(*offset);
@@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
         if (*offset < length && U16_IS_LEAD(result)) {
             int32_t ahead = *offset + 1;
             c = charAt(*offset, context);
-            if (c == 0x5C /*'\\'*/ && ahead < length) {
-                // Calling u_unescapeAt recursively may cause a stack overflow if
-                // we have repeated surrogate lead after that. Limit the
-                // length to 5 ('u' and 4 hex) after ahead.
-                int32_t tailLimit = ahead + 5;
+            if (c == u'\\' && ahead < length) {
+                // Calling ourselves recursively may cause a stack overflow if
+                // we have repeated escaped lead surrogates.
+                // Limit the length to 11 ("x{0000DFFF}") after ahead.
+                int32_t tailLimit = ahead + 11;
                 if (tailLimit > length) {
                     tailLimit = length;
                 }
-                c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit,
-                                         context);
+                c = u_unescapeAt(charAt, &ahead, tailLimit, context);
             }
             if (U16_IS_TRAIL(c)) {
                 *offset = ahead;
@@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
     }
 
     /* Convert C-style escapes in table */
-    for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
+    for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
         if (c == UNESCAPE_MAP[i]) {
             return UNESCAPE_MAP[i+1];
         } else if (c < UNESCAPE_MAP[i]) {
@@ -1322,13 +1320,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt,
     }
 
     /* Map \cX to control-X: X & 0x1F */
-    if (c == 0x0063 /*'c'*/ && *offset < length) {
+    if (c == u'c' && *offset < length) {
         c = charAt((*offset)++, context);
         if (U16_IS_LEAD(c) && *offset < length) {
             UChar c2 = charAt(*offset, context);
             if (U16_IS_TRAIL(c2)) {
                 ++(*offset);
-                c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
+                c = U16_GET_SUPPLEMENTARY(c, c2);
             }
         }
         return 0x1F & c;
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp
index 560f72a7ffc..b3af65791b8 100644
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -1345,6 +1345,12 @@ void UnicodeStringTest::TestUnescape(void) {
     if (!UNICODE_STRING("wrong \\u sequence", 17).unescape().isEmpty()) {
         errln("FAIL: unescaping of a string with an illegal escape sequence did not return an empty string");
     }
+
+    // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
+    UnicodeString euro = UnicodeString(u"\\u20aC").unescape();
+    assertEquals("ASCII Euro", u"€", euro);
+    UnicodeString nonASCIIEuro = UnicodeString(u"\\u୨෦ａＣ").unescape();
+    assertTrue("unescape() accepted non-ASCII digits", nonASCIIEuro.isEmpty());
 }
 
 /* test code point counting functions --------------------------------------- */
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
index 59e44d6eb68..5308cad9eb9 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -777,6 +777,28 @@ public final class Utility {
         /*v*/ 0x76, 0x0b
     };
 
+    /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
+    private static final int _digit8(int c) {
+        if (c >= '0' && c <= '7') {
+            return c - '0';
+        }
+        return -1;
+    }
+
+    /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
+    private static final int _digit16(int c) {
+        if (c >= '0' && c <= '9') {
+            return c - '0';
+        }
+        if (c >= 'A' && c <= 'F') {
+            return c - ('A' - 10);
+        }
+        if (c >= 'a' && c <= 'f') {
+            return c - ('a' - 10);
+        }
+        return -1;
+    }
+
     /**
      * Converts an escape to a code point value. We attempt
      * to parallel the icu4c unescapeAt() function.
@@ -788,26 +810,26 @@ public final class Utility {
      * @return the code point and length, or -1 on error.
      */
     public static int unescapeAndLengthAt(CharSequence s, int offset) {
-        int c;
+        return unescapeAndLengthAt(s, offset, s.length());
+    }
+
+    private static int unescapeAndLengthAt(CharSequence s, int offset, int length) {
         int result = 0;
         int n = 0;
         int minDig = 0;
         int maxDig = 0;
         int bitsPerDigit = 4;
         int dig;
-        int i;
         boolean braces = false;
 
         /* Check that offset is in range */
-        int length = s.length();
         if (offset < 0 || offset >= length) {
             return -1;
         }
         int start = offset;
 
         /* Fetch first UChar after '\\' */
-        c = Character.codePointAt(s, offset);
-        offset += UTF16.getCharCount(c);
+        int c = s.charAt(offset++);
 
         /* Convert hexadecimal and octal escapes */
         switch (c) {
@@ -819,7 +841,7 @@ public final class Utility {
             break;
         case 'x':
             minDig = 1;
-            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
+            if (offset < length && s.charAt(offset) == '{') {
                 ++offset;
                 braces = true;
                 maxDig = 8;
@@ -828,7 +850,7 @@ public final class Utility {
             }
             break;
         default:
-            dig = UCharacter.digit(c, 8);
+            dig = _digit8(c);
             if (dig >= 0) {
                 minDig = 1;
                 maxDig = 3;
@@ -840,20 +862,20 @@ public final class Utility {
         }
         if (minDig != 0) {
             while (offset < length && n < maxDig) {
-                c = UTF16.charAt(s, offset);
-                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
+                c = s.charAt(offset);
+                dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
                 if (dig < 0) {
                     break;
                 }
                 result = (result << bitsPerDigit) | dig;
-                offset += UTF16.getCharCount(c);
+                ++offset;
                 ++n;
             }
             if (n < minDig) {
                 return -1;
             }
             if (braces) {
-                if (c != 0x7D /*}*/) {
+                if (c != '}') {
                     return -1;
                 }
                 ++offset;
@@ -867,9 +889,16 @@ public final class Utility {
             // supplementary.
             if (offset < length && UTF16.isLeadSurrogate(result)) {
                 int ahead = offset+1;
-                c = s.charAt(offset); // [sic] get 16-bit code unit
+                c = s.charAt(offset);
                 if (c == '\\' && ahead < length) {
-                    int cpAndLength = unescapeAndLengthAt(s, ahead);
+                    // Calling ourselves recursively may cause a stack overflow if
+                    // we have repeated escaped lead surrogates.
+                    // Limit the length to 11 ("x{0000DFFF}") after ahead.
+                    int tailLimit = ahead + 11;
+                    if (tailLimit > length) {
+                        tailLimit = length;
+                    }
+                    int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit);
                     if (cpAndLength >= 0) {
                         c = cpAndLength >> 8;
                         ahead += cpAndLength & 0xff;
@@ -877,14 +906,14 @@ public final class Utility {
                 }
                 if (UTF16.isTrailSurrogate(c)) {
                     offset = ahead;
-                    result = Character.toCodePoint((char) result, (char) c);
+                    result = UCharacter.toCodePoint(result, c);
                 }
             }
             return codePointAndLength(result, start, offset);
         }
 
         /* Convert C-style escapes in table */
-        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
+        for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
             if (c == UNESCAPE_MAP[i]) {
                 return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
             } else if (c < UNESCAPE_MAP[i]) {
@@ -894,12 +923,20 @@ public final class Utility {
 
         /* Map \cX to control-X: X & 0x1F */
         if (c == 'c' && offset < length) {
-            c = UTF16.charAt(s, offset);
-            return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c));
+            c = Character.codePointAt(s, offset);
+            return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c));
         }
 
         /* If no special forms are recognized, then consider
-         * the backslash to generically escape the next character. */
+         * the backslash to generically escape the next character.
+         * Deal with surrogate pairs. */
+        if (UTF16.isLeadSurrogate(c) && offset < length) {
+            int c2 = s.charAt(offset);
+            if (UTF16.isTrailSurrogate(c2)) {
+                ++offset;
+                c = UCharacter.toCodePoint(c, c2);
+            }
+        }
         return codePointAndLength(c, start, offset);
     }
 
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index 912698734e5..cda5bf1839e 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -225,9 +225,8 @@ import com.ibm.icu.util.VersionInfo;
  *     </tr>
  *     <tr style="vertical-align: top">
  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
- *       <td style="vertical-align: top;"><em>any character for which
- *       </em><code>Character.digit(c, 16)</code><em>
- *       returns a non-negative result</em></td>
+ *       <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
+ *       &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
  *     </tr>
  *     <tr>
  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
index 15d1e601049..ddc8b0d9aea 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -63,6 +63,17 @@ public class UtilityTest extends TestFmwk {
         assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B));
         assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E));
         assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F));
+
+        // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
+        String euro = Utility.unescape("\\u20aC");
+        assertEquals("ASCII Euro", "€", euro);
+        try {
+            Utility.unescape("\\u୨෦ａＣ");
+            fail("unescape() accepted non-ASCII digits");
+        } catch(IllegalArgumentException expected) {
+        }
+        String euro2 = Utility.unescapeLeniently("\\u20aC\\u୨෦ａＣ");
+        assertEquals("lenient", "€\\u୨෦ａＣ", euro2);
     }
 
     @Test