From ce640dc85040b5312b2fb22439853cf3107eb02c Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Fri, 5 Mar 2021 22:25:53 +0000 Subject: [PATCH] ICU-21127 Error when rbbi got unpaired surrogate char See #1520 --- icu4c/source/common/rbbiscan.cpp | 4 ++ icu4c/source/test/intltest/rbbitst.cpp | 40 +++++++++++++++++++ icu4c/source/test/intltest/rbbitst.h | 1 + .../src/com/ibm/icu/text/RBBIRuleBuilder.java | 3 ++ .../src/com/ibm/icu/text/RBBIRuleScanner.java | 3 ++ .../com/ibm/icu/dev/test/rbbi/RBBITest.java | 37 +++++++++++++++++ 6 files changed, 88 insertions(+) diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index 10b7e9b68ee..45911b1cfe0 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -856,6 +856,10 @@ UChar32 RBBIRuleScanner::nextCharLL() { return (UChar32)-1; } ch = fRB->fRules.char32At(fNextIndex); + if (U_IS_SURROGATE(ch)) { + error(U_ILLEGAL_CHAR_FOUND); + return U_SENTINEL; + } fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); if (ch == chCR || diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 8e3086b5151..b02478c48bf 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -134,6 +134,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable); TESTCASE_AUTO(TestTable_8_16_Bits); TESTCASE_AUTO(TestBug13590); + TESTCASE_AUTO(TestUnpairedSurrogate); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -5323,4 +5324,43 @@ void RBBITest::TestTraceCreateBreakEngine(void) { } #endif +void RBBITest::TestUnpairedSurrogate() { + UnicodeString rules(u"ab;"); + + UErrorCode status = U_ZERO_ERROR; + UParseError pe; + RuleBasedBreakIterator bi1(rules, pe, status); + assertSuccess(WHERE, status); + UnicodeString rtRules = bi1.getRules(); + // make sure the simple one work first. + assertEquals(WHERE, rules, rtRules); + + + rules = UnicodeString(u"a\\ud800b;").unescape(); + pe.line = 0; + pe.offset = 0; + RuleBasedBreakIterator bi2(rules, pe, status); + assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status); + if (pe.line != 1 || pe.offset != 1) { + errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset); + } + + status = U_ZERO_ERROR; + rules = UnicodeString(u"a\\ude00b;").unescape(); + pe.line = 0; + pe.offset = 0; + RuleBasedBreakIterator bi3(rules, pe, status); + assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status); + if (pe.line != 1 || pe.offset != 1) { + errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset); + } + + // make sure the surrogate one work too. + status = U_ZERO_ERROR; + rules = UnicodeString(u"a😀b;"); + RuleBasedBreakIterator bi4(rules, pe, status); + rtRules = bi4.getRules(); + assertEquals(WHERE, rules, rtRules); +} + #endif // #if !UCONFIG_NO_BREAK_ITERATION diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index da144114af8..754b3e69ea3 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -83,6 +83,7 @@ public: void TestReverse(std::unique_ptrbi); void TestBug13692(); void TestDebugRules(); + void TestUnpairedSurrogate(); void TestDebug(); void TestProperties(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index b086e32655d..1da270347c8 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -85,6 +85,9 @@ class RBBIRuleBuilder { // using these simplified the porting, and consolidated the // creation of Java exceptions // + static final int U_ILLEGAL_CHAR_FOUND = 12; + /**< Character conversion: Illegal input sequence/combination of input units. */ + static final int U_BRK_ERROR_START = 0x10200; /**< Start of codes indicating Break Iterator failures */ diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index fae2773bac5..c9a8aff5a6d 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -723,6 +723,9 @@ class RBBIRuleScanner { return -1; } ch = UTF16.charAt(fRB.fRules, fNextIndex); + if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) { + error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND); + } fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); if (ch == '\r' || diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index 268f6c03f44..dae29ad0785 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -905,4 +905,41 @@ public class RBBITest extends TestFmwk { assertEquals("Wrong number of breaks found", 2, breaksFound); } + /* Test handling of unpair surrogate. + */ + @Test + public void TestUnpairedSurrogate() { + // make sure the simple one work first. + String rules = "ab;"; + RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules); + assertEquals("Rules does not match", rules, bi.toString()); + + try { + new RuleBasedBreakIterator("a\ud800b;"); + fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate."); + } + catch (IllegalArgumentException e) { + // expected exception with unpair surrogate. + } + catch (Exception e) { + fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e); + } + + try { + new RuleBasedBreakIterator("a\ude00b;"); + fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate."); + } + catch (IllegalArgumentException e) { + // expected exception with unpair surrogate. + } + catch (Exception e) { + fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e); + } + + + // make sure the surrogate one work too. + rules = "a😀b;"; + bi = new RuleBasedBreakIterator(rules); + assertEquals("Rules does not match", rules, bi.toString()); + } }