ICU-21492 Fix regex compile assertion failure.

A regex pattern containing nested look-behind blocks could trigger an assertion
failure during pattern compilation. The problem was caused by an off-by-one
error in the code that computes an upper bound on the match length, needed
because look-behind expressions are constrained to not have unbounded match
length.

Nested look-behind blocks come into play because, when computing the maximum
match length of an outer block, any inner look-behind blocks are skipped over -
they do not directly contribute to the length matched by the outer block. The
problem was in the code that skips over these nested look-behind blocks.
This commit is contained in:
Andy Heninger 2021-02-13 13:28:10 -08:00 committed by Frank Yung-Fong Tang
parent 352b481146
commit f062244cdb
2 changed files with 10 additions and 2 deletions

View file

@@ -3475,6 +3475,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
// value may be longer than the actual maximum; it must
// never be shorter.
//
// start, end: the range of the pattern to check.
// end is inclusive.
//
//------------------------------------------------------------------------------
int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
if (U_FAILURE(*fStatus)) {
@@ -3720,14 +3723,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// Look-behind. Scan forward until the matching look-around end,
// without processing the look-behind block.
int32_t dataLoc = URX_VAL(op);
-for (loc = loc + 1; loc < end; ++loc) {
+for (loc = loc + 1; loc <= end; ++loc) {
op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
int32_t opType = URX_TYPE(op);
if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
break;
}
}
-U_ASSERT(loc < end);
+U_ASSERT(loc <= end);
}
break;

View file

@@ -1497,6 +1497,11 @@
#
"(?w)\b" v2 "äää<0></0> äää"
# Bug ICU-21492 Assertion failure with nested look-around expressions.
#
"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}" E "<0></0>" # orig failure from bug report, w mismatched parens.
"(?:(?<=(?:(?<=)){2}))" "<0></0>" # Simplified case, with a valid pattern.
# Random debugging, Temporary
#