diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 2ac1011e461..b4d35160439 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -525,6 +525,7 @@ void RegexCompile::compile( // Optimization passes // matchStartType(); + OptDotStar(); stripNOPs(); OptEndingLoop(); @@ -3195,18 +3196,30 @@ void RegexCompile::OptEndingLoop() { //---------------------------------------------------------------------------------------- // -// OptDotStar Optimize patterns that end with a '.*' to -// just advance the input to the end without further todo. +// OptDotStar Optimize patterns that end with a '.*' or '.+' to +// just advance the input to the end. +// +// Transform this compiled sequence +// [DOT_ANY | DOT_ANY_ALL] +// JMP_SAV to previous instruction +// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]* +// END +// +// To +// NOP +// [DOT_ANY_PL | DOT_ANY_ALL_PL] +// [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]* +// END // //---------------------------------------------------------------------------------------- void RegexCompile::OptDotStar() { // Scan backwards in the pattern, looking for a JMP_SAV near the end. - int32_t jmp_loc; + int32_t jmpLoc; int32_t op; int32_t opType; - for (jmp_loc=fRXPat->fCompiledPat->size(); jmp_loc--;) { - U_ASSERT(jmp_loc>0); - op = fRXPat->fCompiledPat->elementAti(jmp_loc); + for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) { + U_ASSERT(jmpLoc>0); + op = fRXPat->fCompiledPat->elementAti(jmpLoc); opType = URX_TYPE(op); switch(opType) { @@ -3214,6 +3227,9 @@ void RegexCompile::OptDotStar() { case URX_END: case URX_NOP: case URX_END_CAPTURE: + case URX_DOLLAR_M: + case URX_DOLLAR: + case URX_BACKSLASH_Z: // These ops may follow the JMP_SAV without preventing us from // doing this optimization. continue; @@ -3230,47 +3246,31 @@ void RegexCompile::OptDotStar() { } // We found in URX_JMP_SAV near the end that is a candidate for optimizing. - // Scan the body of the loop for anything that prevents the optimization, - // which is anything that does a state save, or anything that - // alters the current stack frame (like a capture start/end) + // Is the target address the previous instruction? + // Is the previous instruction a flavor of URX_DOTANY int32_t loopTopLoc = URX_VAL(op); - U_ASSERT(loopTopLoc > 1 && loopTopLoc < jmp_loc); - int32_t loc; - for (loc=loopTopLoc; locfCompiledPat->elementAti(loc); - opType = URX_TYPE(op); - switch(opType) { - - case URX_STATE_SAVE: - case URX_JMP_SAV: - case URX_JMP_SAV_X: - case URX_CTR_INIT: - case URX_CTR_INIT_NG: - case URX_CTR_LOOP: - case URX_CTR_LOOP_NG: - case URX_LD_SP: - case URX_END_CAPTURE: - case URX_START_CAPTURE: - // These ops do a state save. - // Can not do the optimization. - return; - - default: - // Other ops within the loop are OK. - ;// keep looking. - } + if (loopTopLoc != jmpLoc-1) { + return; + } + int32_t newOp; + int32_t oldOp = fRXPat->fCompiledPat->elementAti(loopTopLoc); + int32_t oldOpType = opType = URX_TYPE(oldOp); + if (oldOpType == URX_DOTANY) { + newOp = URX_BUILD(URX_DOTANY_PL, 0); + } + else if (oldOpType == URX_DOTANY_ALL) { + newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0); + } else { + return; // Sequence we were looking for isn't there. } - // Everything checks out. We can do the optimization. - insertOp(jmp_loc); // Make space for the extra operand word 0f URX_JMP_SAV_X - op = URX_BUILD(URX_JMP_SAV_X, loopTopLoc); - fRXPat->fCompiledPat->setElementAt(op, jmp_loc); - - int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 1; - fRXPat->fCompiledPat->setElementAt(dataLoc, jmp_loc+1); + // Substitute the new instructions into the pattern. + // The NOP will be removed in a later optimization step. + fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc); + fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc); } + //---------------------------------------------------------------------------------------- // // Error Report a rule parse error. diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 27a2cf878be..7c21d321dd1 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -162,7 +162,7 @@ enum { // Used for debug printing only. #define URX_OPCODE_NAMES \ " ", \ - "URX_BACKTRACK", \ + "BACKTRACK", \ "END", \ "ONECHAR", \ "STRING", \ @@ -176,22 +176,22 @@ enum { "DOTANY", \ "JMP", \ "FAIL", \ - "URX_JMP_SAV", \ - "URX_BACKSLASH_B", \ - "URX_BACKSLASH_G", \ - "URX_JMP_SAV_X", \ - "URX_BACKSLASH_X", \ - "URX_BACKSLASH_Z", \ - "URX_DOTANY_ALL", \ - "URX_BACKSLASH_D", \ - "URX_CARET", \ - "URX_DOLLAR", \ + "JMP_SAV", \ + "BACKSLASH_B", \ + "BACKSLASH_G", \ + "JMP_SAV_X", \ + "BACKSLASH_X", \ + "BACKSLASH_Z", \ + "DOTANY_ALL", \ + "BACKSLASH_D", \ + "CARET", \ + "DOLLAR", \ "CTR_INIT", \ "CTR_INIT_NG", \ - "CTR_UNUSED_2", \ + "DOTANY_PL", \ "CTR_LOOP", \ "CTR_LOOP_NG", \ - "CTR_UNUSED_3", \ + "DOTANY_ALL_PL", \ "RELOC_OPRND", \ "STO_SP", \ "LD_SP", \ diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 9fc2c7a873e..a3a5bfcf387 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -280,6 +280,27 @@ "$" "abc<0>" +# +# Optimizations of .* at end of patterns +# +"abc.*" "<0>abcdef" +"abc.*$" "<0>abcdef" +"abc(.*)" "<0>abc<1>def" +"abc(.*)" "<0>abc<1>" +"abc.*" "<0>abc\ndef" +"abc.*" s "<0>abc\ndef" +"abc.*$" s "<0>abc\ndef" +"abc.*$" "abc\ndef" +"abc.*$" m "<0>abc\ndef" +"abc.*\Z" m "abc\ndef" +"abc.*\Z" sm "<0>abc\ndef" + +"abc*" "<0>abcccd" +"abc*$" "<0>abccc" +"ab(?:ab[xyz]\s)*" "<0>ababy abx abc" + +"(?:abc|a)(?:bc)+" "<0>abc" + # # Random debugging, Temporary #