diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index b3fa5352dcb..2713429fe4d 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -399,8 +399,24 @@ UBool RegexCompile::doParseActions(EParseAction action) break; case doOpenNonCaptureParen: - // Open Paren. - break; + // Open non-capturing (grouping only) Paren. + // Compile to a + // - NOP, which later may be replaced by a save-state if the + // parenthesized group gets a * quantifier, followed by + // - NOP, which may later be replaced by a save-state if there + // is an '|' alternation within the parens. + { + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); + + // On the Parentheses stack, start a new frame and add the positions + // of the two NOPs. + fParenStack.push(-1, *fStatus); // Begin a new frame. + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP + } + break; + case doOpenAtomicParen: // Open Paren. @@ -473,6 +489,19 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; + case doNGPlus: + // Non-greedy '+?' compiles to + // 1. stuff to be repeated (already built) + // 2. state-save 1 + // 3. ... + { + int32_t topLoc = blockTopLoc(FALSE); + int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); + fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); + } + break; + + case doOpt: // Normal (greedy) ? quantifier. // Compiles to @@ -481,12 +510,21 @@ UBool RegexCompile::doParseActions(EParseAction action) // 3. ... // Insert the state save into the compiled pattern, and we're done. 
{ - int32_t saveStateLoc = blockTopLoc(); + int32_t saveStateLoc = blockTopLoc(TRUE); int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); } break; + case doNGOpt: + // Non-greedy ?? quantifier + // compiles to + // 1. jmp 4 + // 2. body of optional stuff + // 3 jmp 5 + // 4. state save 2 + // 5 ... + case doStar: @@ -499,7 +537,7 @@ UBool RegexCompile::doParseActions(EParseAction action) // { // location of item #1, the STATE_SAVE - int32_t saveStateLoc = blockTopLoc(); + int32_t saveStateLoc = blockTopLoc(TRUE); // Locate the position in the compiled pattern where the match will continue // after completing the *. (4 in the comment above) @@ -516,6 +554,23 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; + case doNGStar: + // Non-greedy *? quantifier + // compiles to + // 1. JMP 3 + // 2. body of stuff being iterated over + // 3. STATE_SAVE 2 + // 4 ... + { + int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. + int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. + int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); + int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); + fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); + fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); + } + break; + case doStartString: // We've just scanned a single "normal" character from the pattern, @@ -614,10 +669,41 @@ UBool RegexCompile::doParseActions(EParseAction action) case doBackslashA: - // Scanned a "\A". 
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus); break; + case doBackslashB: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus); + break; + + case doBackslashb: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus); + break; + + case doBackslashG: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); + break; + + case doBackslashW: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 1), *fStatus); + break; + + case doBackslashw: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 0), *fStatus); + break; + + case doBackslashX: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); + break; + + case doBackslashZ: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus); + break; + + case doBackslashz: + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); + break; + case doExit: returnVal = FALSE; break; @@ -674,8 +760,12 @@ UBool RegexCompile::doParseActions(EParseAction action) // is reserved for this purpose. .* or similar don't // and a slot needs to be added. // +// parameter reserveLoc : TRUE - ensure that there is space to add an opcode +// at the returned location. +// FALSE - just return the address, do not reserve a location there. +// //------------------------------------------------------------------------------ -int32_t RegexCompile::blockTopLoc() { +int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { int32_t theLoc; if (fRXPat->fCompiledPat->size() == fMatchCloseParen) { @@ -690,11 +780,13 @@ int32_t RegexCompile::blockTopLoc() { // No slot for STATE_SAVE was pre-reserved in the compiled code. // We need to make space now. 
theLoc = fRXPat->fCompiledPat->size()-1; - int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc); - int32_t prevType = URX_TYPE(opAtTheLoc); - U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY); - int32_t nop = URX_BUILD(URX_NOP, 0); - fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); + if (reserveLoc) { + int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc); + int32_t prevType = URX_TYPE(opAtTheLoc); + U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY); + int32_t nop = URX_BUILD(URX_NOP, 0); + fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); + } } return theLoc; } diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 0416a0533bf..31166c3c5a0 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -89,9 +89,10 @@ private: UChar32 peekCharLL(); UnicodeSet *scanSet(); void handleCloseParen(); - int32_t blockTopLoc(); // Locate a position in the compiled pattern + int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern // at the top of the just completed block - // or operation. + // or operation, and optionally ensure that + // there is space to add an opcode there. 
UErrorCode *fStatus; diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index cd68bc72a1c..df4db1b5529 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -31,9 +31,11 @@ enum Regex_PatternParseAction { doRuleError, doStartString, doNGOpt, + doBackslashw, doPossesiveStar, doOpenLookBehind, doExprRParen, + doBackslashz, doStar, doPossesivePlus, doNGStar, @@ -41,21 +43,27 @@ enum Regex_PatternParseAction { doPlus, doOpenNonCaptureParen, doBackslashA, + doBackslashB, doNGPlus, doPatFinish, doIntervalMinValue, doIntervalDigit, doPossesiveOpt, + doBackslashG, doOpt, doOpenAtomicParen, doStringChar, doOpenLookAhead, doNumberExpectedError, doDotAny, + doBackslashW, + doBackslashX, doScanUnicodeSet, + doBackslashZ, doNOP, doExit, doPatStart, + doBackslashb, doEndString, doOpenLookBehindNeg, doSplitString, @@ -87,7 +95,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doDotAny, 46 /* . */, 18,0, TRUE} // 7 , {doNOP, 92 /* \ */, 59,0, TRUE} // 8 , {doNOP, 253, 2,0, FALSE} // 9 - , {doRuleError, 255, 61,0, FALSE} // 10 + , {doRuleError, 255, 69,0, FALSE} // 10 , {doStringChar, 254, 11,0, TRUE} // 11 string , {doStringChar, 130, 11,0, TRUE} // 12 , {doSplitString, 63 /* ? */, 18,0, FALSE} // 13 @@ -109,10 +117,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29 , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30 , {doNOP, 60 /* < */, 33,0, TRUE} // 31 - , {doBadOpenParenType, 255, 61,0, FALSE} // 32 + , {doBadOpenParenType, 255, 69,0, FALSE} // 32 , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34 - , {doBadOpenParenType, 255, 61,0, FALSE} // 35 + , {doBadOpenParenType, 255, 69,0, FALSE} // 35 , {doNGStar, 63 /* ? 
*/, 22,0, TRUE} // 36 quant-star , {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37 , {doStar, 255, 22,0, FALSE} // 38 @@ -124,21 +132,29 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doOpt, 255, 22,0, FALSE} // 44 , {doNOP, 129, 45,0, TRUE} // 45 interval-open , {doIntervalMinValue, 128, 48,0, FALSE} // 46 - , {doNumberExpectedError, 255, 61,0, FALSE} // 47 + , {doNumberExpectedError, 255, 69,0, FALSE} // 47 , {doNOP, 129, 52,0, TRUE} // 48 interval-value , {doNOP, 125 /* } */, 52,0, FALSE} // 49 , {doIntervalDigit, 128, 48,0, TRUE} // 50 - , {doNumberExpectedError, 255, 61,0, FALSE} // 51 + , {doNumberExpectedError, 255, 69,0, FALSE} // 51 , {doNOP, 129, 52,0, TRUE} // 52 interval-close , {doTagValue, 125 /* } */, 55,0, TRUE} // 53 - , {doNumberExpectedError, 255, 61,0, FALSE} // 54 + , {doNumberExpectedError, 255, 69,0, FALSE} // 54 , {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval , {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56 , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57 , {doNOP, 255, 3,0, FALSE} // 58 , {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash - , {doStartString, 255, 11,0, TRUE} // 60 - , {doExit, 255, 61,0, TRUE} // 61 errorDeath + , {doBackslashB, 66 /* B */, 3,0, TRUE} // 60 + , {doBackslashb, 98 /* b */, 3,0, TRUE} // 61 + , {doBackslashG, 71 /* G */, 3,0, TRUE} // 62 + , {doBackslashW, 87 /* W */, 3,0, TRUE} // 63 + , {doBackslashw, 119 /* w */, 3,0, TRUE} // 64 + , {doBackslashX, 88 /* X */, 3,0, TRUE} // 65 + , {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 66 + , {doBackslashz, 122 /* z */, 3,0, TRUE} // 67 + , {doStartString, 255, 11,0, TRUE} // 68 + , {doExit, 255, 69,0, TRUE} // 69 errorDeath }; static const char *RegexStateNames[] = { 0, "start", @@ -200,6 +216,14 @@ static const char *RegexStateNames[] = { 0, 0, 0, "backslash", + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, 0, "errorDeath", 0}; diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index 11b92be501b..9d5646db05d 100644 --- 
a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -212,6 +212,15 @@ expr-cont-no-interval: # some of them already; those won't come here. backslash: 'A' n term doBackslashA + 'B' n term doBackslashB + 'b' n term doBackslashb + 'G' n term doBackslashG + 'W' n term doBackslashW + 'w' n term doBackslashw + 'X' n term doBackslashX + 'Z' n term doBackslashZ + 'z' n term doBackslashz + default n string doStartString diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 900b7b9b68d..0e7501321d7 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -26,14 +26,21 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po static const uint32_t URX_NOP = 7; static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number. static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number -static const uint32_t URX_BACKSLASH_A = 10; // Value field is index in pattern to - // loop back to. +static const uint32_t URX_UNUSED10 = 10; static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets. static const uint32_t URX_DOTANY = 12; static const uint32_t URX_JMP = 13; // Value field is destination position in // the pattern. static const uint32_t URX_FAIL = 14; // Stop match operation; No match. +static const uint32_t URX_BACKSLASH_A = 15; +static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B +static const uint32_t URX_BACKSLASH_G = 17; +static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W +static const uint32_t URX_BACKSLASH_X = 19; +static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z + + // // Convenience macros for assembling and disassembling a compiled operation. 
// diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 5898bfd216d..388182a4041 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -193,12 +193,15 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { err = U_INDEX_OUTOFBOUNDS_ERROR; return 0; } - int32_t e = 0; + int32_t e = -1; if (group == 0) { e = fMatchEnd; } else { - int32_t s = fCaptureEnds->elementAti(group); - // TODO: what to do if no match on this specific group? + // Note: When the match engine backs out of a capture group, it sets the + // group's start position to -1. The end position is left with junk. + // So, before returning an end position, we must first check that + // the start position indicates that the group matched something. + int32_t s = fCaptureStarts->elementAti(group); if (s != -1) { e = fCaptureEnds->elementAti(group); } @@ -457,10 +460,11 @@ void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) { inputIdx = fBackTrackStack->popi(); patIdx = fBackTrackStack->popi(); int i; - for (i=0; i<fPattern->fNumCaptureGroups; i++) { - if (fCaptureStarts->elementAti(i) >= inputIdx) { - fCaptureStarts->setElementAt(i, -1); - } + for (i=1; i<=fPattern->fNumCaptureGroups; i++) { + int32_t cge = fBackTrackStack->popi(); + fCaptureEnds->setElementAt(cge, i); + int32_t cgs = fBackTrackStack->popi(); + fCaptureStarts->setElementAt(cgs, i); } } @@ -554,10 +558,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { case URX_STATE_SAVE: - // When saving state for backtracking, the pattern position that a - // backtrack should (eventually) continue at is "opValue". - fBackTrackStack->push(opValue, status); - fBackTrackStack->push(inputIdx, status); + // Save the state of all capture groups, the pattern continuation + // position and the input position. 
+ { + int i; + for (i=fPattern->fNumCaptureGroups; i>0; i--) { + fBackTrackStack->push(fCaptureStarts->elementAt(i), status); + fBackTrackStack->push(fCaptureEnds->elementAt(i), status); + } + fBackTrackStack->push(opValue, status); // pattern continuation position + fBackTrackStack->push(inputIdx, status); // current input position + } break; @@ -579,12 +590,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { fCaptureEnds->setElementAt(inputIdx, opValue); break; - case URX_BACKSLASH_A: + case URX_BACKSLASH_A: // Test for start of input if (inputIdx != 0) { backTrack(inputIdx, patIdx); } break; + case URX_BACKSLASH_B: // Test for word boundaries + if (FALSE) { + backTrack(inputIdx, patIdx); + } + break; + + + case URX_BACKSLASH_G: // Test for position at end of previous match + if (FALSE) { + backTrack(inputIdx, patIdx); + } + break; + + case URX_BACKSLASH_W: // Match word chars (TODO: doesn't belong here? + if (FALSE) { + backTrack(inputIdx, patIdx); + } + break; + + case URX_BACKSLASH_X: // Match combining character sequence + if (FALSE) { + backTrack(inputIdx, patIdx); + } + break; + + case URX_BACKSLASH_Z: // Test for end of line + if (FALSE) { + backTrack(inputIdx, patIdx); + } + break; + + case URX_SETREF: if (inputIdx < fInputLength) { diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 406be3aff68..38d4c97a576 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -411,11 +411,17 @@ static char *opNames[] = { "NOP", "START_CAPTURE", "END_CAPTURE", - "URX_BACKSLASH_A", + "UNUSED10", "SETREF", "DOTANY", "JMP", - "FAIL" + "FAIL", + "URX_BACKSLASH_A", + "URX_BACKSLASH_B", + "URX_BACKSLASH_G", + "URX_BACKSLASH_W", + "URX_BACKSLASH_X", + "URX_BACKSLASH_Z" }; void RegexPattern::dump() { @@ -451,6 +457,9 @@ void RegexPattern::dump() { case URX_NOP: case URX_DOTANY: case URX_FAIL: + case URX_BACKSLASH_A: + case URX_BACKSLASH_G: + case URX_BACKSLASH_X: // Types with no operand field of 
interest. break; @@ -459,6 +468,9 @@ void RegexPattern::dump() { case URX_SETREF: case URX_STATE_SAVE: case URX_JMP: + case URX_BACKSLASH_B: + case URX_BACKSLASH_W: + case URX_BACKSLASH_Z: // types with an integer operand field. printf("%d", val); break; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index b68f1d0981f..fa3b770a9ec 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -5,16 +5,23 @@ ********************************************************************/ // -// regex.cpp +// regextst.cpp // // ICU Regular Expressions test, part of intltest. // #include "unicode/utypes.h" +#include "unicode/uchar.h" #include "intltest.h" #include "regextst.h" +#include "uvector.h" +//--------------------------------------------------------------------------- +// +// Test class boilerplate +// +//--------------------------------------------------------------------------- RegexTest::RegexTest() { }; @@ -43,12 +50,36 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch case 3: name = "API_Pattern"; if (exec) API_Pattern(); break; + case 4: name = "Extended"; + if (exec) Extended(); + break; default: name = ""; break; //needed to end loop } } +//--------------------------------------------------------------------------- +// +// Error Checking / Reporting macros used in all of the tests. +// +//--------------------------------------------------------------------------- +#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. 
status=%d\n", \ +__LINE__, status); return;}} + +#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};} + +#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ +if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};} + +#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ + "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }} + +#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ + errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} + + + //--------------------------------------------------------------------------- // // REGEX_TESTLM Macro + invocation function to simplify writing quick tests @@ -62,13 +93,6 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch // // //--------------------------------------------------------------------------- -#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \ -__LINE__, status); return;}} - -#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};} - -#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ -if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};} #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__); @@ -129,9 +153,156 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match } + + //--------------------------------------------------------------------------- // -// API_Match +// REGEX_FIND Macro + invocation function to simplify writing tests +// regex tests. +// +// usage: +// REGEX_FIND("pattern", "input text"); +// REGEX_FIND_S("pattern", "input text", expected status); +// +// The input text is unescaped. The pattern is not. 
+// The input text is marked with the expected match positions +// <0>text <1> more text +// The tags are removed before trying the match. +// The tags mark the start and end of the match and of any capture groups. +// +// +//--------------------------------------------------------------------------- + +// REGEX_FIND is invoked via a macro, which allows capturing the source file line +// number for use in error messages. +#define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__); +#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__); + + +// Set a value into a UVector at position specified by a decimal number in +// a UnicodeString. This is a utility function needed by the actual test function, +// which follows. +void set(UVector &vec, int val, UnicodeString index) { + UErrorCode status=U_ZERO_ERROR; + int idx = 0; + for (int i=0; i", 0, pe, status); + REGEX_CHECK_STATUS_L(line); + + unEscapedInput = inputString.unescape(); + parseMatcher = parsePat->matcher(unEscapedInput, status); + REGEX_CHECK_STATUS_L(line); + while(parseMatcher->find()) { + parseMatcher->appendReplacement(deTaggedInput, "", status); + REGEX_CHECK_STATUS; + UnicodeString groupNum = parseMatcher->group(2, status); + if (parseMatcher->group(1, status) == "/") { + // close tag + set(groupEnds, deTaggedInput.length(), groupNum); + } else { + set(groupStarts, deTaggedInput.length(), groupNum); + } + } + parseMatcher->appendTail(deTaggedInput); + REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); + + + // + // Do a find on the de-tagged input using the caller's pattern + // + matcher = callerPattern->matcher(deTaggedInput, status); + REGEX_CHECK_STATUS_L(line); + isMatch = matcher->find(); + + // + // Match up the groups from the find() with the groups from the tags + // + + // number of tags should match number of groups from find operation. + // matcher->groupCount does not include group 0, the entire match, hence the +1. 
+ if (isMatch == FALSE && groupStarts.size() != 0) { + errln("Error at line %d: Match expected, but none found.\n", line); + goto cleanupAndReturn; + } + int i; + for (i=0; i<=matcher->groupCount(); i++) { + int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); + if (matcher->start(i, status) != expectedStart) { + errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", + line, i, expectedStart, matcher->start(i, status)); + failed = TRUE; + goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. + } + int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); + if (matcher->end(i, status) != expectedEnd) { + errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", + line, i, expectedEnd, matcher->end(i, status)); + failed = TRUE; + // Error on end position; keep going; real error is probably yet to come as group + // end positions work from end of the input data towards the front. + } + } + if ( matcher->groupCount()+1 < groupStarts.size()) { + errln("Error at line %d: Expected %d capture groups, found %d.", + line, groupStarts.size()-1, matcher->groupCount()); + failed = TRUE; + } + +cleanupAndReturn: + if (failed) { + callerPattern->dump(); + } + delete parseMatcher; + delete parsePat; + delete matcher; + delete callerPattern; +} + + +//--------------------------------------------------------------------------- +// +// API_Match Test that the API for class RegexMatcher +// is present and nominally working, but excluding functions +// implementing replace operations. // //--------------------------------------------------------------------------- void RegexTest::API_Match() { @@ -388,6 +559,13 @@ void RegexTest::API_Match() { REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "bcbcdefg"); + // TODO: need more thorough testing of capture substitutions. 
+ + + // + // Non-Grouping parentheses + // + } @@ -396,10 +574,13 @@ void RegexTest::API_Match() { + + //--------------------------------------------------------------------------- // -// Basic Check for basic functionality of -// regex pattern matching. +// Basic Check for basic functionality of regex pattern matching. +// Avoid the use of REGEX_FIND test macro, which has +// substantial dependencies on basic Regex functionality. // //--------------------------------------------------------------------------- void RegexTest::Basic() { @@ -536,22 +717,125 @@ void RegexTest::Basic() { REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input + // Escape of special chars in patterns + REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); + + }; //--------------------------------------------------------------------------- // -// API_Replace +// API_Replace API test for class RegexMatcher, testing the +// Replace family of functions. // //--------------------------------------------------------------------------- void RegexTest::API_Replace() { + // + // Replace + // + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + + UnicodeString re("abc"); + RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString data = ".abc..abc...abc.."; + // 012345678901234567 + RegexMatcher *matcher = pat->matcher(data, status); + + // + // Plain vanilla matches. + // + UnicodeString dest; + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".yz..abc...abc.."); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".yz..yz...yz.."); + + // + // Plain vanilla non-matches. 
+ // + UnicodeString d2 = ".abx..abx...abx.."; + matcher->reset(d2); + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".abx..abx...abx.."); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".abx..abx...abx.."); + + // + // Empty source string + // + UnicodeString d3 = ""; + matcher->reset(d3); + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ""); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ""); + + // + // Empty substitution string + // + matcher->reset(data); // ".abc..abc...abc.." + dest = matcher->replaceFirst("", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "...abc...abc.."); + + dest = matcher->replaceAll("", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "........"); + + // + // match whole string + // + UnicodeString d4 = "abc"; + matcher->reset(d4); + dest = matcher->replaceFirst("xyz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "xyz"); + + dest = matcher->replaceAll("xyz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "xyz"); + + // + // Capture Group, simple case + // + UnicodeString re2("a(..)"); + RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString d5 = "abcdefg"; + RegexMatcher *matcher2 = pat2->matcher(d5, status); + REGEX_CHECK_STATUS; + dest = matcher2->replaceFirst("$1$1", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "bcbcdefg"); + + // TODO: need more thorough testing of capture substitutions. + + + // + // Non-Grouping parentheses + // + } //--------------------------------------------------------------------------- // -// API_Pattern +// API_Pattern Test that the API for class RegexPattern is +// present and nominally working. 
// //--------------------------------------------------------------------------- void RegexTest::API_Pattern() { @@ -688,6 +972,36 @@ void RegexTest::API_Pattern() { +//--------------------------------------------------------------------------- +// +// Extended A more thorough check for features of regex patterns +// +//--------------------------------------------------------------------------- +void RegexTest::Extended() { + // Capturing parens + REGEX_FIND(".(..).", "<0>a<1>bcd"); + REGEX_FIND(".*\\A( +hello)", "<0><1> hello"); + REGEX_FIND("(hello)|(goodbye)", "<0><1>hello"); + REGEX_FIND("(hello)|(goodbye)", "<0><2>goodbye"); + REGEX_FIND("abc( +( inner(X?) +) xyz)", "leading cruft <0>abc<1> <2> inner<3> xyz cruft"); + + // Non-capturing parens (?: stuff). Groups, but does not capture. + REGEX_FIND("(?:abc)*(tail)", "<0>abcabcabc<1>tail"); + + // Non-greedy *? quantifier + REGEX_FIND(".*?(abc)", "<0> abx <1>abc abc abc abc"); + REGEX_FIND(".*(abc)", "<0> abx abc abc abc <1>abc"); + + REGEX_FIND( "((?:abc |xyz )*?)abc ", "<0><1>xyz abc abc abc "); + REGEX_FIND( "((?:abc |xyz )*)abc ", "<0><1>xyz abc abc abc "); + + // Non-greedy +? quantifier + REGEX_FIND( "(a+?)(a*)", "<0><1>a<2>aaaaaaaaaaaa"); + REGEX_FIND( "(a+)(a*)", "<0><1>aaaaaaaaaaaaa<2>"); + + REGEX_FIND( "((ab)+?)((ab)*)", "<0><1><2>ab<3>ababababab<4>ab"); + REGEX_FIND( "((ab)+)((ab)*)", "<0><1>abababababab<2>ab<3>"); +} diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h index 8d198ae6118..2c4be583093 100644 --- a/icu4c/source/test/intltest/regextst.h +++ b/icu4c/source/test/intltest/regextst.h @@ -25,8 +25,9 @@ public: virtual void API_Pattern(); virtual void API_Replace(); virtual void Basic(); + virtual void Extended(); virtual UBool doRegexLMTest(char *pat, char *text, UBool looking, UBool match, int line); - + virtual void regex_find(char *pat, char *input, UErrorCode expectedStatus, int line); }; #endif