ICU-22579 Fix null dereference when a UnicodeSet contains only strings

2025-04-10 07:39:16 +00:00 · 2023-12-11 14:19:28 -08:00 · 2023-12-11 14:19:28 -08:00 · 4a7d61d261
commit 4a7d61d261
parent 8b14c05791
5 changed files with 35 additions and 2 deletions
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -1224,6 +1224,7 @@ void RBBIRuleScanner::scanSet() {
    UErrorCode localStatus = U_ZERO_ERROR;
    LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
    if (U_FAILURE(localStatus)) {
+        error(localStatus);
        return;
    }
    uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
@ -1240,7 +1241,11 @@ void RBBIRuleScanner::scanSet() {
    // Verify that the set contains at least one code point.
    //
    U_ASSERT(uset.isValid());
-    if (uset->isEmpty()) {
+    UnicodeSet tempSet(*uset);
+    // Use tempSet to handle the case where the UnicodeSet contains only
+    // string elements, such as [{ab}]; such a set is treated as an empty set.
+    tempSet.removeAllStrings();
+    if (tempSet.isEmpty()) {
        // This set is empty.
        //  Make it an error, because it almost certainly is not what the user wanted.
        //  Also, avoids having to think about corner cases in the tree manipulation code
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -144,6 +144,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
    TESTCASE_AUTO(TestRandomAccess);
    TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
    TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
+    TESTCASE_AUTO(TestBug22579);
    TESTCASE_AUTO(TestBug22581);
    TESTCASE_AUTO(TestBug22584);
    TESTCASE_AUTO(TestBug22585);
@ -5895,6 +5896,14 @@ void RBBITest::TestBug22584() {
    RuleBasedBreakIterator bi2(ruleStr, pe, ec);
 }

+void RBBITest::TestBug22579() {
+    // Regression test: a rule whose set holds only a string ([{ab}]) must not cause a null deref in cloneTree.
+    UnicodeString ruleStr = u"[{ab}];";
+    UParseError pe {};
+    UErrorCode ec {U_ZERO_ERROR};
+    // Only construction is exercised here; the value left in ec is not checked by this test.
+    RuleBasedBreakIterator bi(ruleStr, pe, ec);
+}
 void RBBITest::TestBug22581() {
    // Test duplicate variable setting will not leak the rule compilation
    UnicodeString ruleStr = u"$foo=[abc]; $foo=[xyz]; $foo;";
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -98,6 +98,7 @@ public:
    void TestRandomAccess();
    void TestExternalBreakEngineWithFakeTaiLe();
    void TestExternalBreakEngineWithFakeYue();
+    void TestBug22579();
    void TestBug22581();
    void TestBug22584();
    void TestBug22585();
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RBBIRuleScanner.java
@ -1062,7 +1062,11 @@ class RBBIRuleScanner {

        // Verify that the set contains at least one code point.
        //
-        if (uset.isEmpty()) {
+        // Use tempSet to handle the case where the UnicodeSet contains only
+        // string elements, such as [{ab}]; such a set is treated as an empty set.
+        UnicodeSet tempSet = new UnicodeSet(uset);
+        tempSet.removeAllStrings();
+        if (tempSet.isEmpty()) {
            // This set is empty.
            //  Make it an error, because it almost certainly is not what the user wanted.
            //  Also, avoids having to think about corner cases in the tree manipulation code
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
@ -946,6 +946,20 @@ public class RBBITest extends CoreTestFmwk {
        assertEquals("Rules does not match", rules, bi.toString());
    }

+    @Test
+    public void TestBug22579() { // ICU-22579: the rule "[{ab}];" must be rejected with IllegalArgumentException.
+        try {
+            new RuleBasedBreakIterator("[{ab}];");
+            fail("TestBug22579: RuleBasedBreakIterator() failed to throw an exception with only string in an Unicode set.");
+        }
+        catch (IllegalArgumentException e) {
+            // Expected: rules whose Unicode set contains only a string element are rejected.
+        }
+        catch (Exception e) {
+            // Any other exception type is treated as a test failure.
+            fail("TestBug22579: Unexpected exception while new RuleBasedBreakIterator() with only string in an Unicode Set: " + e);
+        }
+
+    }
    @Test
    public void TestBug22585() {
        try {