ICU-22517 Limit the closure expansion loop and return error

To avoid a very slow return from the constructor, we return an
error when the canonical closure of a collation rule expands too much.
Add a soft limit equal to the number of loop iterations needed for 8 Hangul syllables.
  Necessary number of loop iterations: H(0) = 0; H(i) = 3*H(i-1) + 2,
  where i is the number of Hangul syllables in the rule.
  H(1) = 2, H(2) = 8, H(3) = 26, H(4) = 80, H(5) = 242, ..., H(8) = 6560.
This commit is contained in:
Frank Tang 2023-09-25 17:20:22 -07:00 committed by Frank Yung-Fong Tang
parent f6d09d514d
commit 05b0e7abaf
5 changed files with 68 additions and 0 deletions

View file

@ -1113,12 +1113,23 @@ CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeSt
return ce32;
}
// ICU-22517
// Limit on the number of closure strings that addOnlyClosure() processes.
// Once the loop counter exceeds this limit, addOnlyClosure() gives up and sets
// U_INPUT_TOO_LONG_ERROR, to avoid taking a long time for canonical closure
// expansion.
// Please let us know if you have a reasonable use case, i.e. a practical
// collation rule, that needs this limit to be increased.
// This value is needed for compiling a rule with eight Hangul syllables such as
// "&a=b쫊쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic
// usage. (6560 = 3^8 - 1.)
static constexpr int32_t kClosureLoopLimit = 6560;
uint32_t
CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return ce32; }
int32_t loop = 0;
// Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
if(nfdPrefix.isEmpty()) {
CanonicalIterator stringIter(nfdString, errorCode);
@ -1128,6 +1139,11 @@ CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeSt
UnicodeString str = stringIter.next();
if(str.isBogus()) { break; }
if(ignoreString(str, errorCode) || str == nfdString) { continue; }
if (loop++ > kClosureLoopLimit) {
// To avoid hang as in ICU-22517, return with error.
errorCode = U_INPUT_TOO_LONG_ERROR;
return ce32;
}
ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
if(U_FAILURE(errorCode)) { return ce32; }
}
@ -1144,6 +1160,11 @@ CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeSt
UnicodeString str = stringIter.next();
if(str.isBogus()) { break; }
if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
if (loop++ > kClosureLoopLimit) {
// To avoid hang as in ICU-22517, return with error.
errorCode = U_INPUT_TOO_LONG_ERROR;
return ce32;
}
ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
if(U_FAILURE(errorCode)) { return ce32; }
}

View file

@ -1249,6 +1249,18 @@ void CollationRegressionTest::TestBeforeWithTooStrongAfter() {
}
}
// ICU-22517 regression test: builds a RuleBasedCollator from successively longer
// prefixes of a rule that ends in eight Hangul syllables. The resulting status
// is not inspected; the test only requires that each constructor call returns.
void CollationRegressionTest::TestICU22517() {
    IcuTestErrorCode errorCode(*this, "TestICU22517");
    char16_t data[] = u"&a=b쫊쫊쫊쫊쫊쫊쫊쫊";
    icu::UnicodeString rule(true, data, -1);
    // In quick mode the two longest prefixes are skipped.
    const int maxPrefixLength = quick ? rule.length()-2 : rule.length();
    int prefixLength = 4;  // Start with the first four code units of the rule.
    while (prefixLength <= maxPrefixLength) {
        UErrorCode status = U_ZERO_ERROR;
        icu::LocalPointer<icu::RuleBasedCollator> col1(
            new icu::RuleBasedCollator(rule.tempSubString(0, prefixLength), status));
        ++prefixLength;
    }
}
void CollationRegressionTest::TestICU22277() {
IcuTestErrorCode errorCode(*this, "TestICU22277");
UErrorCode status = U_ZERO_ERROR;
@ -1408,6 +1420,7 @@ void CollationRegressionTest::runIndexedTest(int32_t index, UBool exec, const ch
TESTCASE_AUTO(TestTrailingComment);
TESTCASE_AUTO(TestBeforeWithTooStrongAfter);
TESTCASE_AUTO(TestICU22277);
TESTCASE_AUTO(TestICU22517);
TESTCASE_AUTO_END;
}

View file

@ -240,6 +240,8 @@ public:
// Test use-of-uninitialized-value
void TestICU22277();
void TestICU22517();
private:
//------------------------------------------------------------------------
// Internal utilities

View file

@ -24,6 +24,7 @@ import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ICUInputTooLongException;
import com.ibm.icu.util.ULocale;
public final class CollationBuilder extends CollationRuleParser.Sink {
@ -862,10 +863,21 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
return ce32;
}
// ICU-22517
// Limit on the number of closure strings that addOnlyClosure() processes.
// Once the loop counter exceeds this limit, addOnlyClosure() throws
// ICUInputTooLongException, to avoid taking a long time for canonical closure
// expansion.
// Please let us know if you have a reasonable use case, i.e. a practical
// collation rule, that needs this limit to be increased.
// This value is needed for compiling a rule with eight Hangul syllables such as
// "&a=b쫊쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic
// usage. (6560 = 3^8 - 1.)
// Declared final: this is a constant and must not be reassigned at runtime.
private static final int kClosureLoopLimit = 6560;
private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
long[] newCEs, int newCEsLength, int ce32) {
// Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
// TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
int loop = 0;
if(nfdPrefix.length() == 0) {
CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
String prefix = "";
@ -873,6 +885,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
String str = stringIter.next();
if(str == null) { break; }
if(ignoreString(str) || str.contentEquals(nfdString)) { continue; }
if (loop++ > kClosureLoopLimit) {
throw new ICUInputTooLongException("Too many closure");
}
ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
}
} else {
@ -887,6 +902,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
String str = stringIter.next();
if(str == null) { break; }
if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; }
if (loop++ > kClosureLoopLimit) {
throw new ICUInputTooLongException("Too many closure");
}
ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
}
stringIter.reset();

View file

@ -1229,6 +1229,20 @@ public class CollationRegressionTest extends TestFmwk {
assertTrue("b<a", coll.compare("b", "a") < 0);
}
/**
 * ICU-22517 regression test: builds a RuleBasedCollator from successively longer
 * prefixes of a rule that ends in eight Hangul syllables. Any exception from the
 * constructor is ignored; the test only requires that each call returns.
 */
@Test
public void TestICU22517() {
    final String rule = "&a=b쫊쫊쫊쫊쫊쫊쫊쫊";
    final boolean quick = TestFmwk.getExhaustiveness() <= 5;
    // In quick mode the two longest prefixes are skipped.
    int maxPrefixLength = rule.length();
    if (quick) {
        maxPrefixLength -= 2;
    }
    for (int end = 4; end <= maxPrefixLength; ++end) {
        try {
            new RuleBasedCollator(rule.substring(0, end));
        } catch (Exception e) {
            // Deliberately ignored: only the constructor returning matters here.
        }
    }
}
@Test
public void TestBeforeWithTooStrongAfter() {
// ICU ticket #9959: