ICU-21127 Error when rbbi got unpaired surrogate char

See #1520
This commit is contained in:
Frank Tang 2021-03-05 22:25:53 +00:00 committed by Frank Yung-Fong Tang
parent b1a685a676
commit ce640dc850
6 changed files with 88 additions and 0 deletions

View file

@ -856,6 +856,10 @@ UChar32 RBBIRuleScanner::nextCharLL() {
return (UChar32)-1;
}
ch = fRB->fRules.char32At(fNextIndex);
if (U_IS_SURROGATE(ch)) {
error(U_ILLEGAL_CHAR_FOUND);
return U_SENTINEL;
}
fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
if (ch == chCR ||

View file

@ -134,6 +134,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
TESTCASE_AUTO(TestTable_8_16_Bits);
TESTCASE_AUTO(TestBug13590);
TESTCASE_AUTO(TestUnpairedSurrogate);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -5323,4 +5324,43 @@ void RBBITest::TestTraceCreateBreakEngine(void) {
}
#endif
/**
 * Verifies how RuleBasedBreakIterator construction handles surrogate code
 * units in the rule source:
 *   1. a plain BMP rule builds and round-trips through getRules();
 *   2. a rule containing an unpaired lead surrogate (U+D800) is rejected with
 *      U_ILLEGAL_CHAR_FOUND, reported at line 1, offset 1;
 *   3. the same for an unpaired trail surrogate (U+DE00);
 *   4. a rule containing a well-formed surrogate pair (a supplementary
 *      character) builds and round-trips through getRules().
 */
void RBBITest::TestUnpairedSurrogate() {
    UnicodeString rules(u"ab;");

    UErrorCode status = U_ZERO_ERROR;
    UParseError pe;
    RuleBasedBreakIterator bi1(rules, pe, status);
    assertSuccess(WHERE, status);
    UnicodeString rtRules = bi1.getRules();
    // Make sure the simple case works first.
    assertEquals(WHERE, rules, rtRules);

    // Start the negative case from a clean status so that a failure already
    // reported for bi1 is not reported a second time as a wrong error code.
    status = U_ZERO_ERROR;
    rules = UnicodeString(u"a\\ud800b;").unescape();
    // Clear the parse position so the check below only passes if the
    // constructor actually wrote the error location.
    pe.line = 0;
    pe.offset = 0;
    RuleBasedBreakIterator bi2(rules, pe, status);
    assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
    if (pe.line != 1 || pe.offset != 1) {
        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
    }

    status = U_ZERO_ERROR;
    rules = UnicodeString(u"a\\ude00b;").unescape();
    pe.line = 0;
    pe.offset = 0;
    RuleBasedBreakIterator bi3(rules, pe, status);
    assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
    if (pe.line != 1 || pe.offset != 1) {
        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
    }

    // Make sure a properly paired surrogate is still accepted.
    status = U_ZERO_ERROR;
    rules = UnicodeString(u"a😀b;");
    RuleBasedBreakIterator bi4(rules, pe, status);
    // Check the build result explicitly: without this, a construction failure
    // would only surface indirectly through the getRules() comparison.
    assertSuccess(WHERE, status);
    rtRules = bi4.getRules();
    assertEquals(WHERE, rules, rtRules);
}
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -83,6 +83,7 @@ public:
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
void TestBug13692();
void TestDebugRules();
void TestUnpairedSurrogate();
void TestDebug();
void TestProperties();

View file

@ -85,6 +85,9 @@ class RBBIRuleBuilder {
// using these simplified the porting, and consolidated the
// creation of Java exceptions
//
static final int U_ILLEGAL_CHAR_FOUND = 12;
/**< Character conversion: Illegal input sequence/combination of input units. */
static final int U_BRK_ERROR_START = 0x10200;
/**< Start of codes indicating Break Iterator failures */

View file

@ -723,6 +723,9 @@ class RBBIRuleScanner {
return -1;
}
ch = UTF16.charAt(fRB.fRules, fNextIndex);
if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) {
error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
}
fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
if (ch == '\r' ||

View file

@ -905,4 +905,41 @@ public class RBBITest extends TestFmwk {
assertEquals("Wrong number of breaks found", 2, breaksFound);
}
/**
 * Test handling of unpaired surrogates in break rules.
 *
 * A plain rule and a rule containing a well-formed surrogate pair must build
 * and round-trip through toString(); a rule containing a lone high (lead)
 * surrogate U+D800 or a lone low (trail) surrogate U+DE00 must be rejected
 * with an IllegalArgumentException.
 */
@Test
public void TestUnpairedSurrogate() {
    // Make sure the simple case works first.
    String rules = "ab;";
    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
    assertEquals("Rules does not match", rules, bi.toString());

    // U+D800 is a high (lead) surrogate with no low surrogate following it.
    try {
        new RuleBasedBreakIterator("a\ud800b;");
        fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
    }
    catch (IllegalArgumentException e) {
        // Expected exception for an unpaired surrogate.
    }
    catch (Exception e) {
        fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
    }

    // U+DE00 is a low (trail) surrogate with no high surrogate preceding it.
    try {
        new RuleBasedBreakIterator("a\ude00b;");
        fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
    }
    catch (IllegalArgumentException e) {
        // Expected exception for an unpaired surrogate.
    }
    catch (Exception e) {
        fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
    }

    // Make sure a properly paired surrogate is still accepted.
    rules = "a😀b;";
    bi = new RuleBasedBreakIterator(rules);
    assertEquals("Rules does not match", rules, bi.toString());
}
}