diff --git a/icu4c/source/common/ustring.cpp b/icu4c/source/common/ustring.cpp index 7ab2e1bf891..bba2d45c4e8 100644 --- a/icu4c/source/common/ustring.cpp +++ b/icu4c/source/common/ustring.cpp @@ -1294,7 +1294,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, int32_t ahead = *offset + 1; c = charAt(*offset, context); if (c == 0x5C /*'\\'*/ && ahead < length) { - c = (UChar) u_unescapeAt(charAt, &ahead, length, context); + // Calling u_unescapeAt recursively may cause a stack overflow if + // we have repeated surrogate lead after that. Limit the + // length to 5 ('u' and 4 hex) after ahead. + int32_t tailLimit = ahead + 5; + if (tailLimit > length) { + tailLimit = length; + } + c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit, + context); } if (U16_IS_TRAIL(c)) { *offset = ahead; diff --git a/icu4c/source/test/cintltst/custrtst.c b/icu4c/source/test/cintltst/custrtst.c index 70bdb1a1044..a1fc7b9c815 100644 --- a/icu4c/source/test/cintltst/custrtst.c +++ b/icu4c/source/test/cintltst/custrtst.c @@ -22,6 +22,7 @@ #include "unicode/ucnv.h" #include "unicode/uiter.h" #include "cintltst.h" +#include "cstring.h" #include "cmemory.h" #include @@ -36,6 +37,7 @@ static void TestStringFunctions(void); static void TestStringSearching(void); static void TestSurrogateSearching(void); static void TestUnescape(void); +static void TestUnescapeRepeatedSurrogateLead20725(void); static void TestCountChar32(void); static void TestUCharIterator(void); @@ -48,6 +50,8 @@ void addUStringTest(TestNode** root) addTest(root, &TestStringSearching, "tsutil/custrtst/TestStringSearching"); addTest(root, &TestSurrogateSearching, "tsutil/custrtst/TestSurrogateSearching"); addTest(root, &TestUnescape, "tsutil/custrtst/TestUnescape"); + addTest(root, &TestUnescapeRepeatedSurrogateLead20725, + "tsutil/custrtst/TestUnescapeRepeatedSurrogateLead20725"); addTest(root, &TestCountChar32, "tsutil/custrtst/TestCountChar32"); addTest(root, &TestUCharIterator, "tsutil/custrtst/TestUCharIterator"); } @@ -1124,6 +1128,55 @@ TestUnescape() { /* ### TODO: test u_unescapeAt() */ } +static void +TestUnescapeRepeatedSurrogateLead20725() { + const int32_t repeat = 20000; + const int32_t srclen = repeat * 6 + 1; + char *src = (char*)malloc(srclen); + UChar *dest = (UChar*) malloc(sizeof(UChar) * (repeat + 1)); + if (src == NULL || dest == NULL) { + log_err("memory allocation error"); + } + for (int32_t i = 0; i < repeat; i++) { + uprv_strcpy(src + (i * 6), "\\ud841"); + } + int32_t len = u_unescape(src, dest, repeat); + if (len != repeat) { + log_err("failure in u_unescape()"); + } + for (int32_t i = 0; i < repeat; i++) { + if (dest[i] != 0xd841) { + log_err("failure in u_unescape() return value"); + } + } + free(src); + + // A few simple test cases to make sure that the code recovers properly + u_unescape("\\ud841\\x5A", dest, repeat); + const UChar expected1[] = {0xd841, 'Z', 0}; + if (u_strcmp(dest, expected1)!=0) { + log_err("u_unescape() should return u\"\\ud841Z\" but got %s", dest); + } + + u_unescape("\\ud841\\U00050005", dest, repeat); + const UChar expected2[] = {0xd841, 0xd900, 0xdc05, 0}; + if (u_strcmp(dest, expected2)!=0) { + log_err("u_unescape() should return u\"\\ud841\\ud900\\udc05\" " + "but got %s", dest); + } + + // \\xXX is ill-formed. The documentation states: + // If an escape sequence is ill-formed, this method returns an empty string. + u_unescape("\\ud841\\xXX", dest, repeat); + const UChar expected3[] = { 0 }; + if (u_strcmp(dest, expected3)!=0) { + log_err("u_unescape() should return empty string"); + } + + free(dest); + +} + /* test code point counting functions --------------------------------------- */ /* reference implementation of u_strHasMoreChar32Than() */