diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 5d1eb629d42..cacc069e962 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -561,7 +561,7 @@ UBool RegexCompile::doParseActions(int32_t action) // sequence; don't change without making updates there too. // // Compiles to - // 1 START_LA dataLoc Saves SP, Input Pos + // 1 LA_START dataLoc Saves SP, Input Pos, Active input region. // 2. STATE_SAVE 4 on failure of lookahead, goto 4 // 3 JMP 6 continue ... // @@ -575,10 +575,14 @@ UBool RegexCompile::doParseActions(int32_t action) // 8. code for parenthesized stuff. // 9. LA_END // - // Two data slots are reserved, for saving the stack ptr and the input position. + // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. { fixLiterals(); - int32_t dataLoc = allocateData(2); + int32_t dataLoc = allocateData(4); appendOp(URX_LA_START, dataLoc); appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); @@ -599,18 +603,23 @@ UBool RegexCompile::doParseActions(int32_t action) case doOpenLookAheadNeg: // Negated Lookahead. (?! stuff ) // Compiles to - // 1. START_LA dataloc + // 1. LA_START dataloc // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, // // which continues with the match. // 3. NOP // Std. Open Paren sequence, for possible '|' // 4. code for parenthesized stuff. - // 5. END_LA // Cut back stack, remove saved state from step 2. + // 5. LA_END // Cut back stack, remove saved state from step 2. // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. // 7. END_LA // Restore match region, in case look-ahead was using // an alternate (transparent) region. 
+ // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. { fixLiterals(); - int32_t dataLoc = allocateData(2); + int32_t dataLoc = allocateData(4); appendOp(URX_LA_START, dataLoc); appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. appendOp(URX_NOP, 0); @@ -644,14 +653,16 @@ UBool RegexCompile::doParseActions(int32_t action) // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry - // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. + // The first four items must match the layout of data for LA_START / LA_END // Generate match code for any pending literals. fixLiterals(); // Allocate data space - int32_t dataLoc = allocateData(4); + int32_t dataLoc = allocateData(5); // Emit URX_LB_START appendOp(URX_LB_START, dataLoc); @@ -696,14 +707,16 @@ UBool RegexCompile::doParseActions(int32_t action) // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry - // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. + // The first four items must match the layout of data for LA_START / LA_END // Generate match code for any pending literals. 
fixLiterals(); // Allocate data space - int32_t dataLoc = allocateData(4); + int32_t dataLoc = allocateData(5); // Emit URX_LB_START appendOp(URX_LB_START, dataLoc); diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 51db8821678..590d2168952 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -123,7 +123,7 @@ enum { // saved input position, FAIL rather than taking // the JMP URX_LA_START = 37, // Starting a LookAround expression. - // Save InputPos and SP in static data. + // Save InputPos, SP and active region in static data. // Operand: Static data offset for the save URX_LA_END = 38, // Ending a Lookaround expression. // Restore InputPos and Stack to saved values. diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index ef06c21e7e0..6d6ea0fb5dd 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -3805,11 +3805,13 @@ GC_Done: case URX_LA_START: { - // Entering a lookahead block. + // Entering a look around block. // Save Stack Ptr, Input Pos. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; + fData[opValue+2] = fActiveStart; + fData[opValue+3] = fActiveLimit; fActiveStart = fLookStart; // Set the match region change for fActiveLimit = fLookLimit; // transparent bounds. } @@ -3819,7 +3821,7 @@ GC_Done: { // Leaving a look-ahead block. // restore Stack Ptr, Input Pos to positions they had on entry to block. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); int32_t stackSize = fStack->size(); int32_t newStackSize =(int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); @@ -3839,8 +3841,10 @@ GC_Done: // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region.
- fActiveStart = fRegionStart; - fActiveLimit = fRegionLimit; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); } break; @@ -3916,17 +3920,19 @@ GC_Done: case URX_LB_START: { // Entering a look-behind block. - // Save Stack Ptr, Input Pos. + // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. Ticket #6067 - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; - // Init the variable containing the start index for attempted matches. - fData[opValue+2] = -1; // Save input string length, then reset to pin any matches to end at // the current position. + fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; + fActiveStart = fRegionStart; fActiveLimit = fp->fInputIdx; + // Init the variable containing the start index for attempted matches. + fData[opValue+4] = -1; } break; @@ -3949,8 +3955,8 @@ GC_Done: U_ASSERT(minML >= 0); // Fetch (from data) the last input index where a match was attempted. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); - int64_t &lbStartIdx = fData[opValue+2]; + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); + int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; @@ -3976,10 +3982,10 @@ GC_Done: // getting a match. Backtrack out, and out of the // Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(fFrameSize); - int64_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInputLength); - fActiveLimit = restoreInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); break; } @@ -3993,7 +3999,7 @@ GC_Done: case URX_LB_END: // End of a look-behind block, after a successful match. { - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. @@ -4004,13 +4010,13 @@ GC_Done: break; } - // Look-behind match is good. Restore the orignal input string length, + // Look-behind match is good. Restore the original input string region, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - int64_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInputLength); - fActiveLimit = originalInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); } break; @@ -4035,8 +4041,8 @@ GC_Done: U_ASSERT(continueLoc > fp->fPatIdx); // Fetch (from data) the last input index where a match was attempted. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); - int64_t &lbStartIdx = fData[opValue+2]; + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); + int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; @@ -4061,10 +4067,10 @@ GC_Done: // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded.
Jump forward to the continue location - int64_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInputLength); - fActiveLimit = restoreInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); fp->fPatIdx = continueLoc; break; } @@ -4079,7 +4085,7 @@ GC_Done: case URX_LBN_END: // End of a negative look-behind block, after a successful match. { - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. @@ -4096,10 +4102,10 @@ GC_Done: // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - int64_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInputLength); - fActiveLimit = originalInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); // Restore original stack position, discarding any state saved // by the successful pattern match. @@ -5336,11 +5342,13 @@ GC_Done: case URX_LA_START: { - // Entering a lookahead block. + // Entering a look around block. // Save Stack Ptr, Input Pos. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; + fData[opValue+2] = fActiveStart; + fData[opValue+3] = fActiveLimit; fActiveStart = fLookStart; // Set the match region change for fActiveLimit = fLookLimit; // transparent bounds. } @@ -5348,9 +5356,9 @@ GC_Done: case URX_LA_END: { - // Leaving a look-ahead block. + // Leaving a look around block.
// restore Stack Ptr, Input Pos to positions they had on entry to block. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); int32_t stackSize = fStack->size(); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); @@ -5370,8 +5378,10 @@ GC_Done: // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region. - fActiveStart = fRegionStart; - fActiveLimit = fRegionLimit; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); } break; @@ -5434,17 +5444,19 @@ GC_Done: case URX_LB_START: { // Entering a look-behind block. - // Save Stack Ptr, Input Pos. + // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. Ticket #6067 - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; - // Init the variable containing the start index for attempted matches. - fData[opValue+2] = -1; // Save input string length, then reset to pin any matches to end at // the current position. + fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; + fActiveStart = fRegionStart; fActiveLimit = fp->fInputIdx; + // Init the variable containing the start index for attempted matches. + fData[opValue+4] = -1; } break; @@ -5462,8 +5474,8 @@ GC_Done: U_ASSERT(minML >= 0); // Fetch (from data) the last input index where a match was attempted. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); - int64_t &lbStartIdx = fData[opValue+2]; + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); + int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; @@ -5485,10 +5497,10 @@ GC_Done: // getting a match. Backtrack out, and out of the // Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(fFrameSize); - int64_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInputLength); - fActiveLimit = restoreInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); break; } @@ -5502,7 +5514,7 @@ GC_Done: case URX_LB_END: // End of a look-behind block, after a successful match. { - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. @@ -5513,13 +5525,13 @@ GC_Done: break; } - // Look-behind match is good. Restore the orignal input string length, + // Look-behind match is good. Restore the original input string region, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - int64_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInputLength); - fActiveLimit = originalInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); } break; @@ -5539,8 +5551,8 @@ GC_Done: U_ASSERT(continueLoc > fp->fPatIdx); // Fetch (from data) the last input index where a match was attempted. - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); - int64_t &lbStartIdx = fData[opValue+2]; + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); + int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; @@ -5561,10 +5573,10 @@ GC_Done: // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded.
Jump forward to the continue location - int64_t restoreInputLen = fData[opValue+3]; - U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInputLength); - fActiveLimit = restoreInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); fp->fPatIdx = continueLoc; break; } @@ -5579,7 +5591,7 @@ GC_Done: case URX_LBN_END: // End of a negative look-behind block, after a successful match. { - U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); + U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. @@ -5596,10 +5608,10 @@ GC_Done: // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - int64_t originalInputLen = fData[opValue+3]; - U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInputLength); - fActiveLimit = originalInputLen; + fActiveStart = fData[opValue+2]; + fActiveLimit = fData[opValue+3]; + U_ASSERT(fActiveStart >= 0); + U_ASSERT(fActiveLimit <= fInputLength); // Restore original stack position, discarding any state saved // by the successful pattern match.
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index db660488a95..b6391d2fa28 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -3525,11 +3525,16 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } parseMatcher->appendTail(deTaggedInput); - REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); + + if (groupStarts.size() != groupEnds.size()) { + errln("Error at line %d: mismatched group tags in expected results.", line); + failed = true; + goto cleanupAndReturn; + } if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { - errln("mismatched tags"); - failed = TRUE; - goto cleanupAndReturn; + errln("mismatched tags"); + failed = TRUE; + goto cleanupAndReturn; } // diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 759a1963a37..91356123c5b 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -88,6 +88,28 @@ "abc(?!def)" b "abcdef" "abc(?!def)" b "<0>abcxyz" +# +# Nested Lookahead / Behind +# +"one(?=(?:(?!).)*)" "<0>one stuff" +"one(?=(?:(?!).)*)" "one " + +# More nesting lookaround: pattern matches "qq" when not preceded by 'a' and followed by 'z' +"(?qqc" +"(?qqc" +"(?A<0>jk<2>B" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "ajkB" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "Ajkb" + +# Nested lookaround cases from bug ICU-20564 +"(?<=(?<=((?=)){0}+))" "<0>abc" +"(?<=c(?<=c((?=c)){1}+))" "c<0><1>cc" + # # Anchoring Bounds # @@ -1456,11 +1478,14 @@ "abc(?=de(?=f))...g" "<0>abcdefg" "abc(?=de(?=f))...g" "abcdxfg" +# Bug ICU-20618 Assertion failure with nested look-around expressions. +# +"(?<=(?<=b?(?=a)))" "hello, world." + # Random debugging, Temporary # - # # Regexps from http://www.regexlib.com #