diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 4a4ce193b64..ed9a3eb72ff 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1521,6 +1521,7 @@ private: UnicodeString& rebuiltPat, uint32_t options, UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, UErrorCode& ec); //---------------------------------------------------------------- diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index 44bb4bcd05f..0b7da796826 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -129,7 +129,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); + applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status); if (U_FAILURE(status)) return *this; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index c3482b09875..6ae6e71289b 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -257,6 +257,7 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { return i.fSet; } +namespace { // Cache some sets for other services -------------------------------------- *** void U_CALLCONV createUni32Set(UErrorCode &errorCode) { @@ -315,6 +316,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { // memory leak checker tools #define _dbgct(me) +} // namespace + //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- @@ -382,7 +385,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); + applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status); if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); @@ -406,6 +409,8 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { // Implementation: Pattern parsing //---------------------------------------------------------------- +namespace { + /** * A small all-inline class to manage a UnicodeSet pointer. Add * operator->() etc. as needed. @@ -424,6 +429,10 @@ public: } }; +constexpr int32_t MAX_DEPTH = 100; + +} // namespace + /** * Parse the pattern from the given RuleCharacterIterator. The * iterator is advanced over the parsed pattern. @@ -443,8 +452,13 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, UnicodeString& rebuiltPat, uint32_t options, UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, UErrorCode& ec) { if (U_FAILURE(ec)) return; + if (depth > MAX_DEPTH) { + ec = U_ILLEGAL_ARGUMENT_ERROR; + return; + } // Syntax characters: [ ] ^ - & { } @@ -579,7 +593,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, } switch (setMode) { case 1: - nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); + nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec); break; case 2: chars.skipIgnored(opts); @@ -837,6 +851,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // Property set implementation //---------------------------------------------------------------- +namespace { + static UBool numericValueFilter(UChar32 ch, void* context) { return u_getNumericValue(ch) == *(double*)context; } @@ -868,6 +884,8 @@ static UBool scriptExtensionsFilter(UChar32 ch, void* context) { return uscript_hasScript(ch, *(UScriptCode*)context); } +} // namespace + /** * Generic filter-based scanning code for UCD property UnicodeSets. */ @@ -924,6 +942,8 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, } } +namespace { + static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { /* Note: we use ' ' in compiler code page */ int32_t j = 0; @@ -941,6 +961,8 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { return TRUE; } +} // namespace + //---------------------------------------------------------------- // Property set API //---------------------------------------------------------------- diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index d142f048d58..b6f2eab6a84 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -42,15 +42,6 @@ UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { return left + UnicodeSetTest::escape(pat); } -#define CASE(id,test) case id: \ - name = #test; \ - if (exec) { \ - logln(#test "---"); \ - logln(); \ - test(); \ - } \ - break - UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { } @@ -100,6 +91,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestUCAUnsafeBackwards); TESTCASE_AUTO(TestIntOverflow); TESTCASE_AUTO(TestUnusedCcc); + TESTCASE_AUTO(TestDeepPattern); TESTCASE_AUTO_END; } @@ -3970,3 +3962,19 @@ void UnicodeSetTest::TestUnusedCcc() { assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty()); #endif } + +void UnicodeSetTest::TestDeepPattern() { + IcuTestErrorCode errorCode(*this, "TestDeepPattern"); + // Nested ranges are parsed via recursion which can use a lot of stack space. + // After a reasonable limit, we should get an error. + constexpr int32_t DEPTH = 20000; + UnicodeString pattern, suffix; + for (int32_t i = 0; i < DEPTH; ++i) { + pattern.append(u"[a", 2); + suffix.append(']'); + } + pattern.append(suffix); + UnicodeSet set(pattern, errorCode); + assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure()); + errorCode.reset(); +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index b34728ae631..e79a9e8e77d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -93,6 +93,7 @@ private: void TestUCAUnsafeBackwards(); void TestIntOverflow(); void TestUnusedCcc(); + void TestDeepPattern(); private: diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 3270541e19c..14710c8e762 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -2422,7 +2422,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa StringBuilder rebuiltPat = new StringBuilder(); RuleCharacterIterator chars = new RuleCharacterIterator(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options); + applyPattern(chars, symbols, rebuiltPat, options, 0); if (chars.inVariable()) { syntaxError(chars, "Extra chars in variable value"); } @@ -2458,6 +2458,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa SETMODE2_PROPERTYPAT = 2, SETMODE3_PREPARSED = 3; + private static final int MAX_DEPTH = 100; + /** * Parse the pattern from the given RuleCharacterIterator. The * iterator is advanced over the parsed pattern. @@ -2473,7 +2475,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * IGNORE_SPACE, CASE. */ private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, - Appendable rebuiltPat, int options) { + Appendable rebuiltPat, int options, int depth) { + if (depth > MAX_DEPTH) { + syntaxError(chars, "Pattern nested too deeply"); + } // Syntax characters: [ ] ^ - & { } @@ -2605,7 +2610,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } switch (setMode) { case SETMODE1_UNICODESET: - nested.applyPattern(chars, symbols, patBuf, options); + nested.applyPattern(chars, symbols, patBuf, options, depth + 1); break; case SETMODE2_PROPERTYPAT: chars.skipIgnored(opts); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java index 5ebaa91b70e..ecb6767db7b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java @@ -2773,4 +2773,23 @@ public class UnicodeSetTest extends TestFmwk { } catch (IllegalArgumentException expected) { } } + + @Test + public void TestDeepPattern() { + // Nested ranges are parsed via recursion which can use a lot of stack space. + // After a reasonable limit, we should get an error. + final int DEPTH = 20000; + StringBuilder pattern = new StringBuilder(); + StringBuilder suffix = new StringBuilder(); + for (int i = 0; i < DEPTH; ++i) { + pattern.append("[a"); + suffix.append(']'); + } + pattern.append(suffix); + try { + new UnicodeSet(pattern.toString()); + fail("[a[a[a...1000s...]]] did not throw an exception"); + } catch(RuntimeException expected) { + } + } }