ICU-22585 Fix infinity loop while unicode set contains single surrogate

This commit is contained in:
Frank Tang 2023-12-09 01:45:39 -08:00 committed by Frank Yung-Fong Tang
parent 7d3cd7cba5
commit 8b14c05791
4 changed files with 58 additions and 18 deletions

View file

@ -770,6 +770,7 @@ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, Unicode
RBBINode *usetNode = new RBBINode(RBBINode::uset);
if (usetNode == nullptr) {
error(U_MEMORY_ALLOCATION_ERROR);
delete setToAdopt;
return;
}
usetNode->fInputSet = setToAdopt;
@ -1210,7 +1211,6 @@ RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
UnicodeSet *uset;
ParsePosition pos;
int startPos;
int i;
@ -1222,12 +1222,11 @@ void RBBIRuleScanner::scanSet() {
pos.setIndex(fScanIndex);
startPos = fScanIndex;
UErrorCode localStatus = U_ZERO_ERROR;
uset = new UnicodeSet();
if (uset == nullptr) {
localStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
if (U_FAILURE(localStatus)) {
return;
}
uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time.
@ -1235,20 +1234,18 @@ void RBBIRuleScanner::scanSet() {
RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
#endif
error(localStatus);
delete uset;
return;
}
// Verify that the set contains at least one code point.
//
U_ASSERT(uset!=nullptr);
U_ASSERT(uset.isValid());
if (uset->isEmpty()) {
// This set is empty.
// Make it an error, because it almost certainly is not what the user wanted.
// Also, avoids having to think about corner cases in the tree manipulation code
// that occurs later on.
error(U_BRK_RULE_EMPTY_SET);
delete uset;
return;
}
@ -1257,7 +1254,7 @@ void RBBIRuleScanner::scanSet() {
// Don't just set fScanIndex because the line/char positions maintained
// for error reporting would be thrown off.
i = pos.getIndex();
for (;;) {
for (;U_SUCCESS(*fRB->fStatus);) {
if (fNextIndex >= i) {
break;
}
@ -1280,7 +1277,7 @@ void RBBIRuleScanner::scanSet() {
// character categories for run time engine.
// - Eliminates mulitiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n->fText, n, uset);
findSetFor(n->fText, n, uset.orphan());
}
}

View file

@ -146,6 +146,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
TESTCASE_AUTO(TestBug22581);
TESTCASE_AUTO(TestBug22584);
TESTCASE_AUTO(TestBug22585);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -5860,6 +5861,23 @@ void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
expected2 == actual2);
}
// Test a single unpaired unpaired char (either surrogate low or high) in
// an Unicode set will not cause infinity loop.
void RBBITest::TestBug22585() {
UnicodeString rule = u"$a=[";
rule.append(0xdecb) // an unpaired surrogate high
.append("];");
UParseError pe {};
UErrorCode ec {U_ZERO_ERROR};
RuleBasedBreakIterator bi(rule, pe, ec);
rule = u"$a=[";
rule.append(0xd94e) // an unpaired surrogate low
.append("];");
ec = U_ZERO_ERROR;
RuleBasedBreakIterator bi2(rule, pe, ec);
}
void RBBITest::TestBug22584() {
// Creating a break iterator from a rule consisting of a very long
// literal input string caused a stack overflow when deleting the

View file

@ -100,6 +100,7 @@ public:
void TestExternalBreakEngineWithFakeYue();
void TestBug22581();
void TestBug22584();
void TestBug22585();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();

View file

@ -908,7 +908,7 @@ public class RBBITest extends CoreTestFmwk {
assertEquals("Wrong number of breaks found", 2, breaksFound);
}
/* Test handling of unpair surrogate.
/* Test handling of unpaired surrogate.
*/
@Test
public void TestUnpairedSurrogate() {
@ -919,24 +919,24 @@ public class RBBITest extends CoreTestFmwk {
try {
new RuleBasedBreakIterator("a\ud800b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}
try {
new RuleBasedBreakIterator("a\ude00b;");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpair surrogate.
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}
@ -946,6 +946,30 @@ public class RBBITest extends CoreTestFmwk {
assertEquals("Rules does not match", rules, bi.toString());
}
@Test
public void TestBug22585() {
try {
new RuleBasedBreakIterator("$a=[\udecb];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired high surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired high surrogate: " + e);
}
try {
new RuleBasedBreakIterator("$a=[\ud94e];");
fail("TestBug22585: RuleBasedBreakIterator() failed to throw an exception with unpaired low surrogate.");
}
catch (IllegalArgumentException e) {
// expected exception with unpaired surrogate.
}
catch (Exception e) {
fail("TestBug22585: Unexpected exception while new RuleBasedBreakIterator() with unpaired low surrogate: " + e);
}
}
/* Test preceding(index) and following(index), with semi-random indexes.
* The random indexes are produced in clusters that are relatively closely spaced,
* to increase the occurrences of hits to the internal break cache.