ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10069
This commit is contained in:
Andy Heninger 2002-10-28 17:18:44 +00:00
parent 65d107bf3d
commit 00767a816c
9 changed files with 554 additions and 51 deletions

View file

@ -399,8 +399,24 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doOpenNonCaptureParen:
// Open Paren.
break;
// Open non-capturing (grouping only) Paren.
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
// - NOP, which may later be replaced by a save-state if there
// is an '|' alternation within the parens.
{
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
// On the Parentheses stack, start a new frame and add the positions
// of the two NOPs.
fParenStack.push(-1, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
}
break;
case doOpenAtomicParen:
// Open Paren.
@ -473,6 +489,19 @@ UBool RegexCompile::doParseActions(EParseAction action)
}
break;
case doNGPlus:
// Non-greedy '+?' compiles to
// 1. stuff to be repeated (already built)
// 2. state-save 1
// 3. ...
{
int32_t topLoc = blockTopLoc(FALSE);
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
}
break;
case doOpt:
// Normal (greedy) ? quantifier.
// Compiles to
@ -481,12 +510,21 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 3. ...
// Insert the state save into the compiled pattern, and we're done.
{
int32_t saveStateLoc = blockTopLoc();
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
}
break;
case doNGOpt:
// Non-greedy ?? quantifier
// compiles to
// 1. jmp 4
// 2. body of optional stuff
// 3 jmp 5
// 4. state save 2
// 5 ...
case doStar:
@ -499,7 +537,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
//
{
// location of item #1, the STATE_SAVE
int32_t saveStateLoc = blockTopLoc();
int32_t saveStateLoc = blockTopLoc(TRUE);
// Locate the position in the compiled pattern where the match will continue
// after completing the *. (4 in the comment above)
@ -516,6 +554,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
}
break;
case doNGStar:
// Non-greedy *? quantifier
// compiles to
// 1. JMP 3
// 2. body of stuff being iterated over
// 3. STATE_SAVE 2
// 4 ...
{
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
}
break;
case doStartString:
// We've just scanned a single "normal" character from the pattern,
@ -614,10 +669,41 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBackslashA:
// Scanned a "\A".
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
break;
case doBackslashB:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
break;
case doBackslashb:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
break;
case doBackslashG:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
break;
case doBackslashW:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 1), *fStatus);
break;
case doBackslashw:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 0), *fStatus);
break;
case doBackslashX:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
break;
case doBackslashZ:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus);
break;
case doBackslashz:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
break;
case doExit:
returnVal = FALSE;
break;
@ -674,8 +760,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
// is reserved for this purpose. .* or similar don't
// and a slot needs to be added.
//
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
// at the returned location.
// FALSE - just return the address, do not reserve a location there.
//
//------------------------------------------------------------------------------
int32_t RegexCompile::blockTopLoc() {
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
int32_t theLoc;
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
{
@ -690,11 +780,13 @@ int32_t RegexCompile::blockTopLoc() {
// No slot for STATE_SAVE was pre-reserved in the compiled code.
// We need to make space now.
theLoc = fRXPat->fCompiledPat->size()-1;
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
int32_t prevType = URX_TYPE(opAtTheLoc);
U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
int32_t nop = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
if (reserveLoc) {
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
int32_t prevType = URX_TYPE(opAtTheLoc);
U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
int32_t nop = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
}
}
return theLoc;
}

View file

@ -89,9 +89,10 @@ private:
UChar32 peekCharLL();
UnicodeSet *scanSet();
void handleCloseParen();
int32_t blockTopLoc(); // Locate a position in the compiled pattern
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
// at the top of the just completed block
// or operation.
// or operation, and optionally ensure that
// there is space to add an opcode there.
UErrorCode *fStatus;

View file

@ -31,9 +31,11 @@ enum Regex_PatternParseAction {
doRuleError,
doStartString,
doNGOpt,
doBackslashw,
doPossesiveStar,
doOpenLookBehind,
doExprRParen,
doBackslashz,
doStar,
doPossesivePlus,
doNGStar,
@ -41,21 +43,27 @@ enum Regex_PatternParseAction {
doPlus,
doOpenNonCaptureParen,
doBackslashA,
doBackslashB,
doNGPlus,
doPatFinish,
doIntervalMinValue,
doIntervalDigit,
doPossesiveOpt,
doBackslashG,
doOpt,
doOpenAtomicParen,
doStringChar,
doOpenLookAhead,
doNumberExpectedError,
doDotAny,
doBackslashW,
doBackslashX,
doScanUnicodeSet,
doBackslashZ,
doNOP,
doExit,
doPatStart,
doBackslashb,
doEndString,
doOpenLookBehindNeg,
doSplitString,
@ -87,7 +95,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
, {doNOP, 253, 2,0, FALSE} // 9
, {doRuleError, 255, 61,0, FALSE} // 10
, {doRuleError, 255, 69,0, FALSE} // 10
, {doStringChar, 254, 11,0, TRUE} // 11 string
, {doStringChar, 130, 11,0, TRUE} // 12
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
@ -109,10 +117,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
, {doBadOpenParenType, 255, 61,0, FALSE} // 32
, {doBadOpenParenType, 255, 69,0, FALSE} // 32
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
, {doBadOpenParenType, 255, 61,0, FALSE} // 35
, {doBadOpenParenType, 255, 69,0, FALSE} // 35
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
, {doStar, 255, 22,0, FALSE} // 38
@ -124,21 +132,29 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpt, 255, 22,0, FALSE} // 44
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
, {doNumberExpectedError, 255, 61,0, FALSE} // 47
, {doNumberExpectedError, 255, 69,0, FALSE} // 47
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
, {doIntervalDigit, 128, 48,0, TRUE} // 50
, {doNumberExpectedError, 255, 61,0, FALSE} // 51
, {doNumberExpectedError, 255, 69,0, FALSE} // 51
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
, {doNumberExpectedError, 255, 61,0, FALSE} // 54
, {doNumberExpectedError, 255, 69,0, FALSE} // 54
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
, {doNOP, 255, 3,0, FALSE} // 58
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
, {doStartString, 255, 11,0, TRUE} // 60
, {doExit, 255, 61,0, TRUE} // 61 errorDeath
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 60
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 61
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 62
, {doBackslashW, 87 /* W */, 3,0, TRUE} // 63
, {doBackslashw, 119 /* w */, 3,0, TRUE} // 64
, {doBackslashX, 88 /* X */, 3,0, TRUE} // 65
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 66
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 67
, {doStartString, 255, 11,0, TRUE} // 68
, {doExit, 255, 69,0, TRUE} // 69 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -200,6 +216,14 @@ static const char *RegexStateNames[] = { 0,
0,
0,
"backslash",
0,
0,
0,
0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -212,6 +212,15 @@ expr-cont-no-interval:
# some of them already; those won't come here.
backslash:
'A' n term doBackslashA
'B' n term doBackslashB
'b' n term doBackslashb
'G' n term doBackslashG
'W' n term doBackslashW
'w' n term doBackslashw
'X' n term doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz
default n string doStartString

View file

@ -26,14 +26,21 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po
static const uint32_t URX_NOP = 7;
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
static const uint32_t URX_BACKSLASH_A = 10; // Value field is index in pattern to
// loop back to.
static const uint32_t URX_UNUSED10 = 10;
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
static const uint32_t URX_DOTANY = 12;
static const uint32_t URX_JMP = 13; // Value field is destination position in
// the pattern.
static const uint32_t URX_FAIL = 14; // Stop match operation; No match.
static const uint32_t URX_BACKSLASH_A = 15;
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
static const uint32_t URX_BACKSLASH_G = 17;
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
static const uint32_t URX_BACKSLASH_X = 19;
static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z
//
// Convenience macros for assembling and disassembling a compiled operation.
//

View file

@ -193,12 +193,15 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
int32_t e = 0;
int32_t e = -1;
if (group == 0) {
e = fMatchEnd;
} else {
int32_t s = fCaptureEnds->elementAti(group);
// TODO: what to do if no match on this specific group?
// Note: When the match engine backs out of a capture group, it sets the
// group's start position to -1. The end position is left with junk.
// So, before returning an end position, we must first check that
// the start position indicates that the group matched something.
int32_t s = fCaptureStarts->elementAti(group);
if (s != -1) {
e = fCaptureEnds->elementAti(group);
}
@ -457,10 +460,11 @@ void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
inputIdx = fBackTrackStack->popi();
patIdx = fBackTrackStack->popi();
int i;
for (i=0; i<fPattern->fNumCaptureGroups; i++) {
if (fCaptureStarts->elementAti(i) >= inputIdx) {
fCaptureStarts->setElementAt(i, -1);
}
for (i=1; i<=fPattern->fNumCaptureGroups; i++) {
int32_t cge = fBackTrackStack->popi();
fCaptureEnds->setElementAt(cge, i);
int32_t cgs = fBackTrackStack->popi();
fCaptureStarts->setElementAt(cgs, i);
}
}
@ -554,10 +558,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_STATE_SAVE:
// When saving state for backtracking, the pattern position that a
// backtrack should (eventually) continue at is "opValue".
fBackTrackStack->push(opValue, status);
fBackTrackStack->push(inputIdx, status);
// Save the state of all capture groups, the pattern continuation
// position and the input position.
{
int i;
for (i=fPattern->fNumCaptureGroups; i>0; i--) {
fBackTrackStack->push(fCaptureStarts->elementAt(i), status);
fBackTrackStack->push(fCaptureEnds->elementAt(i), status);
}
fBackTrackStack->push(opValue, status); // pattern continuation position
fBackTrackStack->push(inputIdx, status); // current input position
}
break;
@ -579,12 +590,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
fCaptureEnds->setElementAt(inputIdx, opValue);
break;
case URX_BACKSLASH_A:
case URX_BACKSLASH_A: // Test for start of input
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_B: // Test for word boundaries
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_G: // Test for position at end of previous match
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_W: // Match word chars (TODO: doesn't belong here?
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_X: // Match combining character sequence
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_BACKSLASH_Z: // Test for end of line
if (FALSE) {
backTrack(inputIdx, patIdx);
}
break;
case URX_SETREF:
if (inputIdx < fInputLength) {

View file

@ -411,11 +411,17 @@ static char *opNames[] = {
"NOP",
"START_CAPTURE",
"END_CAPTURE",
"URX_BACKSLASH_A",
"UNUSED10",
"SETREF",
"DOTANY",
"JMP",
"FAIL"
"FAIL",
"URX_BACKSLASH_A",
"URX_BACKSLASH_B",
"URX_BACKSLASH_G",
"URX_BACKSLASH_W",
"URX_BACKSLASH_X",
"URX_BACKSLASH_Z"
};
void RegexPattern::dump() {
@ -451,6 +457,9 @@ void RegexPattern::dump() {
case URX_NOP:
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_A:
case URX_BACKSLASH_G:
case URX_BACKSLASH_X:
// Types with no operand field of interest.
break;
@ -459,6 +468,9 @@ void RegexPattern::dump() {
case URX_SETREF:
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
// types with an integer operand field.
printf("%d", val);
break;

View file

@ -5,16 +5,23 @@
********************************************************************/
//
// regex.cpp
// regextst.cpp
//
// ICU Regular Expressions test, part of intltest.
//
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "intltest.h"
#include "regextst.h"
#include "uvector.h"
//---------------------------------------------------------------------------
//
// Test class boilerplate
//
//---------------------------------------------------------------------------
RegexTest::RegexTest()
{
};
@ -43,12 +50,36 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 3: name = "API_Pattern";
if (exec) API_Pattern();
break;
case 4: name = "Extended";
if (exec) Extended();
break;
default: name = "";
break; //needed to end loop
}
}
//---------------------------------------------------------------------------
//
// Error Checking / Reporting macros used in all of the tests.
//
//---------------------------------------------------------------------------
// REGEX_CHECK_STATUS: if the local variable "status" holds a failure code,
//    report the current source line and the status value through errln(),
//    then RETURN from the enclosing function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \
__LINE__, status); return;}}
// REGEX_ASSERT: report the current source line through errln() when (expr)
//    evaluates to FALSE.  Does not return; the enclosing function keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// REGEX_ASSERT_FAIL: evaluate (expr) with a fresh, macro-local "status" variable
//    in scope (it hides any outer variable named status), and report an error
//    when the resulting status differs from errcode.  Does not return.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};}
// REGEX_CHECK_STATUS_L: like REGEX_CHECK_STATUS, but also reports the caller
//    supplied line number.  NOTE: unlike REGEX_CHECK_STATUS this macro does
//    NOT return; the caller must test "status" itself if it needs to stop.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
"RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// REGEX_ASSERT_L: report the current and the caller supplied line numbers when
//    (expr) evaluates to FALSE, then RETURN from the enclosing function.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
//---------------------------------------------------------------------------
//
// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
@ -62,13 +93,6 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
//
//
//---------------------------------------------------------------------------
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \
__LINE__, status); return;}}
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};}
#define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
@ -129,9 +153,156 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match
}
//---------------------------------------------------------------------------
//
// API_Match
// REGEX_FIND Macro + invocation function to simplify writing
// regex tests.
//
// usage:
// REGEX_FIND("pattern", "input text");
// REGEX_FIND_S("pattern", "input text", expected status);
//
// The input text is unescaped. The pattern is not.
// The input text is marked with the expected match positions
// <0>text <1> more text </1> </0>
// The <n> </n> tags are removed before trying the match.
// The tags mark the start and end of the match and of any capture groups.
//
//
//---------------------------------------------------------------------------
// REGEX_FIND is invoked via a macro, which allows capturing the source file line
// number for use in error messages.
#define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__);
#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__);
// Store "val" into the vector "vec" at the slot whose number is spelled out,
// in decimal digits, by the string "index".  Helper for regex_find(), below.
//
//   - If "index" contains any character that is not a digit, nothing is stored.
//   - If the vector is too short, it is first padded with -1 entries so that
//     the requested slot exists.
void set(UVector &vec, int val, UnicodeString index) {
    UErrorCode status = U_ZERO_ERROR;

    // Accumulate the decimal value of the index string, one digit at a time.
    int slot = 0;
    int numChars = index.length();
    int pos = 0;
    while (pos < numChars) {
        int digit = u_charDigitValue(index.charAt(pos));
        if (digit < 0) {
            // Not a decimal digit; give up without touching the vector.
            return;
        }
        slot = slot*10 + digit;
        pos++;
    }

    // Grow the vector, filling with -1, until the slot is addressable.
    for (int needed = slot + 1; vec.size() < needed; ) {
        vec.addElement(-1, status);
    }
    vec.setElementAt(val, slot);
}
void RegexTest::regex_find(char *pat, char *input, UErrorCode expectedStatus, int line) {
UnicodeString pattern(pat);
UnicodeString inputString(input);
UnicodeString unEscapedInput;
UnicodeString deTaggedInput;
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
RegexPattern *parsePat = NULL;
RegexMatcher *parseMatcher = NULL;
RegexPattern *callerPattern = NULL;
RegexMatcher *matcher = NULL;
UVector groupStarts(status);
UVector groupEnds(status);
UBool isMatch;
UBool failed = FALSE;
//
// Compile the caller's pattern
//
UnicodeString patString(pat);
callerPattern = RegexPattern::compile(patString, 0, pe, status);
if (status != expectedStatus) {
errln("Line %d: error %x compiling pattern.", line, status);
goto cleanupAndReturn;
}
//
// Find the tags in the input data, remove them, and record the group boundary
// positions.
//
parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
REGEX_CHECK_STATUS_L(line);
unEscapedInput = inputString.unescape();
parseMatcher = parsePat->matcher(unEscapedInput, status);
REGEX_CHECK_STATUS_L(line);
while(parseMatcher->find()) {
parseMatcher->appendReplacement(deTaggedInput, "", status);
REGEX_CHECK_STATUS;
UnicodeString groupNum = parseMatcher->group(2, status);
if (parseMatcher->group(1, status) == "/") {
// close tag
set(groupEnds, deTaggedInput.length(), groupNum);
} else {
set(groupStarts, deTaggedInput.length(), groupNum);
}
}
parseMatcher->appendTail(deTaggedInput);
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
//
// Do a find on the de-tagged input using the caller's pattern
//
matcher = callerPattern->matcher(deTaggedInput, status);
REGEX_CHECK_STATUS_L(line);
isMatch = matcher->find();
//
// Match up the groups from the find() with the groups from the tags
//
// number of tags should match number of groups from find operation.
// matcher->groupCount does not include group 0, the entire match, hence the +1.
if (isMatch == FALSE && groupStarts.size() != 0) {
errln("Error at line %d: Match expected, but none found.\n", line);
goto cleanupAndReturn;
}
int i;
for (i=0; i<=matcher->groupCount(); i++) {
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
if (matcher->start(i, status) != expectedStart) {
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
line, i, expectedStart, matcher->start(i, status));
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
if (matcher->end(i, status) != expectedEnd) {
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
line, i, expectedEnd, matcher->end(i, status));
failed = TRUE;
// Error on end position; keep going; real error is probably yet to come as group
// end positions work from end of the input data towards the front.
}
}
if ( matcher->groupCount()+1 < groupStarts.size()) {
errln("Error at line %d: Expected %d capture groups, found %d.",
line, groupStarts.size()-1, matcher->groupCount());
failed = TRUE;
}
cleanupAndReturn:
if (failed) {
callerPattern->dump();
}
delete parseMatcher;
delete parsePat;
delete matcher;
delete callerPattern;
}
//---------------------------------------------------------------------------
//
// API_Match Test that the API for class RegexMatcher
// is present and nominally working, but excluding functions
// implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
@ -388,6 +559,13 @@ void RegexTest::API_Match() {
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "bcbcdefg");
// TODO: need more thorough testing of capture substitutions.
//
// Non-Grouping parentheses
//
}
@ -396,10 +574,13 @@ void RegexTest::API_Match() {
//---------------------------------------------------------------------------
//
// Basic Check for basic functionality of
// regex pattern matching.
// Basic Check for basic functionality of regex pattern matching.
// Avoid the use of REGEX_FIND test macro, which has
// substantial dependencies on basic Regex functionality.
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {
@ -536,22 +717,125 @@ void RegexTest::Basic() {
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
// Escape of special chars in patterns
REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
};
//---------------------------------------------------------------------------
//
// API_Replace
// API_Replace API test for class RegexMatcher, testing the
// Replace family of functions.
//
//---------------------------------------------------------------------------
void RegexTest::API_Replace() {
//
// Replace
//
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re("abc");
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = ".abc..abc...abc..";
// 012345678901234567
RegexMatcher *matcher = pat->matcher(data, status);
//
// Plain vanilla matches.
//
UnicodeString dest;
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".yz..abc...abc..");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".yz..yz...yz..");
//
// Plain vanilla non-matches.
//
UnicodeString d2 = ".abx..abx...abx..";
matcher->reset(d2);
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".abx..abx...abx..");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".abx..abx...abx..");
//
// Empty source string
//
UnicodeString d3 = "";
matcher->reset(d3);
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "");
//
// Empty substitution string
//
matcher->reset(data); // ".abc..abc...abc.."
dest = matcher->replaceFirst("", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "...abc...abc..");
dest = matcher->replaceAll("", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "........");
//
// match whole string
//
UnicodeString d4 = "abc";
matcher->reset(d4);
dest = matcher->replaceFirst("xyz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "xyz");
dest = matcher->replaceAll("xyz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "xyz");
//
// Capture Group, simple case
//
UnicodeString re2("a(..)");
RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString d5 = "abcdefg";
RegexMatcher *matcher2 = pat2->matcher(d5, status);
REGEX_CHECK_STATUS;
dest = matcher2->replaceFirst("$1$1", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "bcbcdefg");
// TODO: need more through testing of capture substitutions.
//
// Non-Grouping parentheses
//
}
//---------------------------------------------------------------------------
//
// API_Pattern
// API_Pattern Test that the API for class RegexPattern is
// present and nominally working.
//
//---------------------------------------------------------------------------
void RegexTest::API_Pattern() {
@ -688,6 +972,36 @@ void RegexTest::API_Pattern() {
//---------------------------------------------------------------------------
//
// Extended A more thorough check for features of regex patterns
//
//---------------------------------------------------------------------------
void RegexTest::Extended() {
// Each REGEX_FIND(pattern, taggedInput) line below is one test case, run by
// regex_find().  In the input string, <0>...</0> brackets the expected overall
// match and <n>...</n> brackets the expected extent of capture group n.
// The tags are stripped from the input before the pattern is applied.
// A capture group that has no tag in the input is expected to report -1
// for its start and end positions.

// Capturing parens
REGEX_FIND(".(..).", "<0>a<1>bc</1>d</0>");
REGEX_FIND(".*\\A( +hello)", "<0><1> hello</1></0>");
// Alternation: only the group belonging to the alternative that matched is tagged.
REGEX_FIND("(hello)|(goodbye)", "<0><1>hello</1></0>");
REGEX_FIND("(hello)|(goodbye)", "<0><2>goodbye</2></0>");
// Nested capture groups, including group 3, which is expected to match the empty string.
REGEX_FIND("abc( +( inner(X?) +) xyz)", "leading cruft <0>abc<1> <2> inner<3></3> </2> xyz</1></0> cruft");
// Non-capturing parens (?: stuff). Groups, but does not capture.
REGEX_FIND("(?:abc)*(tail)", "<0>abcabcabc<1>tail</1></0>");
// Non-greedy *? quantifier
// Each non-greedy case is followed by the same pattern with the greedy
// quantifier, for contrast in the expected match positions.
REGEX_FIND(".*?(abc)", "<0> abx <1>abc</1></0> abc abc abc");
REGEX_FIND(".*(abc)", "<0> abx abc abc abc <1>abc</1></0>");
REGEX_FIND( "((?:abc |xyz )*?)abc ", "<0><1>xyz </1>abc </0>abc abc ");
REGEX_FIND( "((?:abc |xyz )*)abc ", "<0><1>xyz abc abc </1>abc </0>");
// Non-greedy +? quantifier
// Again paired with the greedy form of the same pattern.
REGEX_FIND( "(a+?)(a*)", "<0><1>a</1><2>aaaaaaaaaaaa</2></0>");
REGEX_FIND( "(a+)(a*)", "<0><1>aaaaaaaaaaaaa</1><2></2></0>");
REGEX_FIND( "((ab)+?)((ab)*)", "<0><1><2>ab</2></1><3>ababababab<4>ab</4></3></0>");
REGEX_FIND( "((ab)+)((ab)*)", "<0><1>abababababab<2>ab</2></1><3></3></0>");
}

View file

@ -25,8 +25,9 @@ public:
virtual void API_Pattern();
virtual void API_Replace();
virtual void Basic();
virtual void Extended();
virtual UBool doRegexLMTest(char *pat, char *text, UBool looking, UBool match, int line);
virtual void regex_find(char *pat, char *input, UErrorCode expectedStatus, int line);
};
#endif