mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10069
This commit is contained in:
parent
65d107bf3d
commit
00767a816c
9 changed files with 554 additions and 51 deletions
icu4c/source
|
@ -399,8 +399,24 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
case doOpenNonCaptureParen:
|
||||
// Open Paren.
|
||||
break;
|
||||
// Open non-capturing (grouping only) Paren.
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
// is an '|' alternation within the parens.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the two NOPs.
|
||||
fParenStack.push(-1, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doOpenAtomicParen:
|
||||
// Open Paren.
|
||||
|
@ -473,6 +489,19 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
}
|
||||
break;
|
||||
|
||||
case doNGPlus:
|
||||
// Non-greedy '+?' compiles to
|
||||
// 1. stuff to be repeated (already built)
|
||||
// 2. state-save 1
|
||||
// 3. ...
|
||||
{
|
||||
int32_t topLoc = blockTopLoc(FALSE);
|
||||
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
|
||||
fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doOpt:
|
||||
// Normal (greedy) ? quantifier.
|
||||
// Compiles to
|
||||
|
@ -481,12 +510,21 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 3. ...
|
||||
// Insert the state save into the compiled pattern, and we're done.
|
||||
{
|
||||
int32_t saveStateLoc = blockTopLoc();
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
|
||||
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
|
||||
}
|
||||
break;
|
||||
|
||||
case doNGOpt:
|
||||
// Non-greedy ?? quantifier
|
||||
// compiles to
|
||||
// 1. jmp 4
|
||||
// 2. body of optional stuff
|
||||
// 3 jmp 5
|
||||
// 4. state save 2
|
||||
// 5 ...
|
||||
|
||||
|
||||
|
||||
case doStar:
|
||||
|
@ -499,7 +537,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
//
|
||||
{
|
||||
// location of item #1, the STATE_SAVE
|
||||
int32_t saveStateLoc = blockTopLoc();
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
|
||||
// Locate the position in the compiled pattern where the match will continue
|
||||
// after completing the *. (4 in the comment above)
|
||||
|
@ -516,6 +554,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
}
|
||||
break;
|
||||
|
||||
case doNGStar:
|
||||
// Non-greedy *? quantifier
|
||||
// compiles to
|
||||
// 1. JMP 3
|
||||
// 2. body of stuff being iterated over
|
||||
// 3. STATE_SAVE 2
|
||||
// 4 ...
|
||||
{
|
||||
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
|
||||
int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
|
||||
int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
|
||||
int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
|
||||
fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
|
||||
fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doStartString:
|
||||
// We've just scanned a single "normal" character from the pattern,
|
||||
|
@ -614,10 +669,41 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
|
||||
case doBackslashA:
|
||||
// Scanned a "\A".
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashB:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashb:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashG:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashW:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 1), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashw:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashX:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashZ:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashz:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doExit:
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
@ -674,8 +760,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// is reserved for this purpose. .* or similar don't
|
||||
// and a slot needs to be added.
|
||||
//
|
||||
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
|
||||
// at the returned location.
|
||||
// FALSE - just return the address, do not reserve a location there.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::blockTopLoc() {
|
||||
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
|
||||
int32_t theLoc;
|
||||
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
|
||||
{
|
||||
|
@ -690,11 +780,13 @@ int32_t RegexCompile::blockTopLoc() {
|
|||
// No slot for STATE_SAVE was pre-reserved in the compiled code.
|
||||
// We need to make space now.
|
||||
theLoc = fRXPat->fCompiledPat->size()-1;
|
||||
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
|
||||
int32_t prevType = URX_TYPE(opAtTheLoc);
|
||||
U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
|
||||
int32_t nop = URX_BUILD(URX_NOP, 0);
|
||||
fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
|
||||
if (reserveLoc) {
|
||||
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
|
||||
int32_t prevType = URX_TYPE(opAtTheLoc);
|
||||
U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
|
||||
int32_t nop = URX_BUILD(URX_NOP, 0);
|
||||
fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
|
||||
}
|
||||
}
|
||||
return theLoc;
|
||||
}
|
||||
|
|
|
@ -89,9 +89,10 @@ private:
|
|||
UChar32 peekCharLL();
|
||||
UnicodeSet *scanSet();
|
||||
void handleCloseParen();
|
||||
int32_t blockTopLoc(); // Locate a position in the compiled pattern
|
||||
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
|
||||
// at the top of the just completed block
|
||||
// or operation.
|
||||
// or operation, and optionally ensure that
|
||||
// there is space to add an opcode there.
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
|
|
@ -31,9 +31,11 @@ enum Regex_PatternParseAction {
|
|||
doRuleError,
|
||||
doStartString,
|
||||
doNGOpt,
|
||||
doBackslashw,
|
||||
doPossesiveStar,
|
||||
doOpenLookBehind,
|
||||
doExprRParen,
|
||||
doBackslashz,
|
||||
doStar,
|
||||
doPossesivePlus,
|
||||
doNGStar,
|
||||
|
@ -41,21 +43,27 @@ enum Regex_PatternParseAction {
|
|||
doPlus,
|
||||
doOpenNonCaptureParen,
|
||||
doBackslashA,
|
||||
doBackslashB,
|
||||
doNGPlus,
|
||||
doPatFinish,
|
||||
doIntervalMinValue,
|
||||
doIntervalDigit,
|
||||
doPossesiveOpt,
|
||||
doBackslashG,
|
||||
doOpt,
|
||||
doOpenAtomicParen,
|
||||
doStringChar,
|
||||
doOpenLookAhead,
|
||||
doNumberExpectedError,
|
||||
doDotAny,
|
||||
doBackslashW,
|
||||
doBackslashX,
|
||||
doScanUnicodeSet,
|
||||
doBackslashZ,
|
||||
doNOP,
|
||||
doExit,
|
||||
doPatStart,
|
||||
doBackslashb,
|
||||
doEndString,
|
||||
doOpenLookBehindNeg,
|
||||
doSplitString,
|
||||
|
@ -87,7 +95,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
|
||||
, {doNOP, 253, 2,0, FALSE} // 9
|
||||
, {doRuleError, 255, 61,0, FALSE} // 10
|
||||
, {doRuleError, 255, 69,0, FALSE} // 10
|
||||
, {doStringChar, 254, 11,0, TRUE} // 11 string
|
||||
, {doStringChar, 130, 11,0, TRUE} // 12
|
||||
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
|
||||
|
@ -109,10 +117,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
|
||||
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
|
||||
, {doBadOpenParenType, 255, 61,0, FALSE} // 32
|
||||
, {doBadOpenParenType, 255, 69,0, FALSE} // 32
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 61,0, FALSE} // 35
|
||||
, {doBadOpenParenType, 255, 69,0, FALSE} // 35
|
||||
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
|
||||
, {doStar, 255, 22,0, FALSE} // 38
|
||||
|
@ -124,21 +132,29 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpt, 255, 22,0, FALSE} // 44
|
||||
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
|
||||
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 47
|
||||
, {doNumberExpectedError, 255, 69,0, FALSE} // 47
|
||||
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
|
||||
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
|
||||
, {doIntervalDigit, 128, 48,0, TRUE} // 50
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 51
|
||||
, {doNumberExpectedError, 255, 69,0, FALSE} // 51
|
||||
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
|
||||
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 54
|
||||
, {doNumberExpectedError, 255, 69,0, FALSE} // 54
|
||||
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
|
||||
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
|
||||
, {doNOP, 255, 3,0, FALSE} // 58
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
|
||||
, {doStartString, 255, 11,0, TRUE} // 60
|
||||
, {doExit, 255, 61,0, TRUE} // 61 errorDeath
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 60
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 61
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 62
|
||||
, {doBackslashW, 87 /* W */, 3,0, TRUE} // 63
|
||||
, {doBackslashw, 119 /* w */, 3,0, TRUE} // 64
|
||||
, {doBackslashX, 88 /* X */, 3,0, TRUE} // 65
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 66
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 67
|
||||
, {doStartString, 255, 11,0, TRUE} // 68
|
||||
, {doExit, 255, 69,0, TRUE} // 69 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -200,6 +216,14 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
"backslash",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
|
|
@ -212,6 +212,15 @@ expr-cont-no-interval:
|
|||
# some of them already; those won't come here.
|
||||
backslash:
|
||||
'A' n term doBackslashA
|
||||
'B' n term doBackslashB
|
||||
'b' n term doBackslashb
|
||||
'G' n term doBackslashG
|
||||
'W' n term doBackslashW
|
||||
'w' n term doBackslashw
|
||||
'X' n term doBackslashX
|
||||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
|
||||
default n string doStartString
|
||||
|
||||
|
||||
|
|
|
@ -26,14 +26,21 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po
|
|||
static const uint32_t URX_NOP = 7;
|
||||
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
|
||||
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
|
||||
static const uint32_t URX_BACKSLASH_A = 10; // Value field is index in pattern to
|
||||
// loop back to.
|
||||
static const uint32_t URX_UNUSED10 = 10;
|
||||
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
|
||||
static const uint32_t URX_DOTANY = 12;
|
||||
static const uint32_t URX_JMP = 13; // Value field is destination position in
|
||||
// the pattern.
|
||||
static const uint32_t URX_FAIL = 14; // Stop match operation; No match.
|
||||
|
||||
static const uint32_t URX_BACKSLASH_A = 15;
|
||||
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
|
||||
static const uint32_t URX_BACKSLASH_G = 17;
|
||||
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
|
||||
static const uint32_t URX_BACKSLASH_X = 19;
|
||||
static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z
|
||||
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
//
|
||||
|
|
|
@ -193,12 +193,15 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
|||
err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
int32_t e = 0;
|
||||
int32_t e = -1;
|
||||
if (group == 0) {
|
||||
e = fMatchEnd;
|
||||
} else {
|
||||
int32_t s = fCaptureEnds->elementAti(group);
|
||||
// TODO: what to do if no match on this specific group?
|
||||
// Note: When the match engine backs out of a capture group, it sets the
|
||||
// group's start position to -1. The end position is left with junk.
|
||||
// So, before returning an end position, we must first check that
|
||||
// the start position indicates that the group matched something.
|
||||
int32_t s = fCaptureStarts->elementAti(group);
|
||||
if (s != -1) {
|
||||
e = fCaptureEnds->elementAti(group);
|
||||
}
|
||||
|
@ -457,10 +460,11 @@ void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
|
|||
inputIdx = fBackTrackStack->popi();
|
||||
patIdx = fBackTrackStack->popi();
|
||||
int i;
|
||||
for (i=0; i<fPattern->fNumCaptureGroups; i++) {
|
||||
if (fCaptureStarts->elementAti(i) >= inputIdx) {
|
||||
fCaptureStarts->setElementAt(i, -1);
|
||||
}
|
||||
for (i=1; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
int32_t cge = fBackTrackStack->popi();
|
||||
fCaptureEnds->setElementAt(cge, i);
|
||||
int32_t cgs = fBackTrackStack->popi();
|
||||
fCaptureStarts->setElementAt(cgs, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -554,10 +558,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
// When saving state for backtracking, the pattern position that a
|
||||
// backtrack should (eventually) continue at is "opValue".
|
||||
fBackTrackStack->push(opValue, status);
|
||||
fBackTrackStack->push(inputIdx, status);
|
||||
// Save the state of all capture groups, the pattern continuation
|
||||
// position and the input position.
|
||||
{
|
||||
int i;
|
||||
for (i=fPattern->fNumCaptureGroups; i>0; i--) {
|
||||
fBackTrackStack->push(fCaptureStarts->elementAt(i), status);
|
||||
fBackTrackStack->push(fCaptureEnds->elementAt(i), status);
|
||||
}
|
||||
fBackTrackStack->push(opValue, status); // pattern continuation position
|
||||
fBackTrackStack->push(inputIdx, status); // current input position
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
@ -579,12 +590,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
fCaptureEnds->setElementAt(inputIdx, opValue);
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_A: // Test for start of input
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_B: // Test for word boundaries
|
||||
if (FALSE) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_G: // Test for position at end of previous match
|
||||
if (FALSE) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_W: // Match word chars (TODO: doesn't belong here?
|
||||
if (FALSE) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_X: // Match combining character sequence
|
||||
if (FALSE) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_Z: // Test for end of line
|
||||
if (FALSE) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_SETREF:
|
||||
if (inputIdx < fInputLength) {
|
||||
|
|
|
@ -411,11 +411,17 @@ static char *opNames[] = {
|
|||
"NOP",
|
||||
"START_CAPTURE",
|
||||
"END_CAPTURE",
|
||||
"URX_BACKSLASH_A",
|
||||
"UNUSED10",
|
||||
"SETREF",
|
||||
"DOTANY",
|
||||
"JMP",
|
||||
"FAIL"
|
||||
"FAIL",
|
||||
"URX_BACKSLASH_A",
|
||||
"URX_BACKSLASH_B",
|
||||
"URX_BACKSLASH_G",
|
||||
"URX_BACKSLASH_W",
|
||||
"URX_BACKSLASH_X",
|
||||
"URX_BACKSLASH_Z"
|
||||
};
|
||||
|
||||
void RegexPattern::dump() {
|
||||
|
@ -451,6 +457,9 @@ void RegexPattern::dump() {
|
|||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
case URX_FAIL:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_X:
|
||||
// Types with no operand field of interest.
|
||||
break;
|
||||
|
||||
|
@ -459,6 +468,9 @@ void RegexPattern::dump() {
|
|||
case URX_SETREF:
|
||||
case URX_STATE_SAVE:
|
||||
case URX_JMP:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_Z:
|
||||
// types with an integer operand field.
|
||||
printf("%d", val);
|
||||
break;
|
||||
|
|
|
@ -5,16 +5,23 @@
|
|||
********************************************************************/
|
||||
|
||||
//
|
||||
// regex.cpp
|
||||
// regextst.cpp
|
||||
//
|
||||
// ICU Regular Expressions test, part of intltest.
|
||||
//
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "intltest.h"
|
||||
#include "regextst.h"
|
||||
#include "uvector.h"
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Test class boilerplate
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
// Default constructor.  The test object carries no state of its own, so there
// is nothing to initialize.  (No trailing ';' after the body: at namespace
// scope that is an empty declaration, which pedantic C++98 builds reject.)
RegexTest::RegexTest()
{
}
|
||||
|
@ -43,12 +50,36 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
case 3: name = "API_Pattern";
|
||||
if (exec) API_Pattern();
|
||||
break;
|
||||
case 4: name = "Extended";
|
||||
if (exec) Extended();
|
||||
break;
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Error Checking / Reporting macros used in all of the tests.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \
|
||||
__LINE__, status); return;}}
|
||||
|
||||
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
|
||||
|
||||
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
|
||||
if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};}
|
||||
|
||||
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
|
||||
"RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
|
||||
|
||||
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
|
||||
errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
|
||||
|
@ -62,13 +93,6 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
//
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \
|
||||
__LINE__, status); return;}}
|
||||
|
||||
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
|
||||
|
||||
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
|
||||
if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};}
|
||||
|
||||
#define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
|
||||
|
||||
|
@ -129,9 +153,156 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// API_Match
|
||||
// REGEX_FIND Macro + invocation function to simplify writing tests
|
||||
// regex tests.
|
||||
//
|
||||
// usage:
|
||||
// REGEX_FIND("pattern", "input text");
|
||||
// REGEX_FIND_S("pattern", "input text", expected status);
|
||||
//
|
||||
// The input text is unescaped. The pattern is not.
|
||||
// The input text is marked with the expected match positions
|
||||
// <0>text <1> more text </1> </0>
|
||||
// The <n> </n> tags are removed before trying the match.
|
||||
// The tags mark the start and end of the match and of any capture groups.
|
||||
//
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// REGEX_FIND is invoked via a macro, which allows capturing the source file line
|
||||
// number for use in error messages.
|
||||
#define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__);
|
||||
#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__);
|
||||
|
||||
|
||||
// Set a value into a UVector at position specified by a decimal number in
|
||||
// a UnicodeString. This is a utility function needed by the actual test function,
|
||||
// which follows.
|
||||
// Store 'val' into 'vec' at the position given by the decimal number spelled
// out in 'index', growing the vector with -1 filler entries as needed.
//
//   vec    vector to update; extended so that the target slot exists.
//   val    value to store.
//   index  string of decimal digits naming the slot.  If any character is not
//          a digit the call is a no-op.  An empty string addresses slot 0.
//
// Failures while growing the vector are not reported to the caller; the
// function simply gives up and leaves the vector as far as it got.
void set(UVector &vec, int val, UnicodeString index) {
    UErrorCode status = U_ZERO_ERROR;

    // Convert the digit string to an integer index.
    int idx = 0;
    for (int i = 0; i < index.length(); i++) {
        int d = u_charDigitValue(index.charAt(i));
        if (d < 0) {
            return;
        }
        idx = idx*10 + d;
    }

    // Pad with -1 ("no position recorded") up to and including the target slot.
    while (vec.size() < idx+1) {
        vec.addElement(-1, status);
        if (U_FAILURE(status)) {
            // The vector could not grow.  Without this check size() would never
            // reach idx+1 and the loop would spin forever.
            return;
        }
    }
    vec.setElementAt(val, idx);
}
|
||||
|
||||
void RegexTest::regex_find(char *pat, char *input, UErrorCode expectedStatus, int line) {
|
||||
UnicodeString pattern(pat);
|
||||
UnicodeString inputString(input);
|
||||
UnicodeString unEscapedInput;
|
||||
UnicodeString deTaggedInput;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
RegexPattern *parsePat = NULL;
|
||||
RegexMatcher *parseMatcher = NULL;
|
||||
RegexPattern *callerPattern = NULL;
|
||||
RegexMatcher *matcher = NULL;
|
||||
UVector groupStarts(status);
|
||||
UVector groupEnds(status);
|
||||
UBool isMatch;
|
||||
UBool failed = FALSE;
|
||||
|
||||
//
|
||||
// Compile the caller's pattern
|
||||
//
|
||||
UnicodeString patString(pat);
|
||||
callerPattern = RegexPattern::compile(patString, 0, pe, status);
|
||||
if (status != expectedStatus) {
|
||||
errln("Line %d: error %x compiling pattern.", line, status);
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
//
|
||||
// Find the tags in the input data, remove them, and record the group boundary
|
||||
// positions.
|
||||
//
|
||||
parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
|
||||
unEscapedInput = inputString.unescape();
|
||||
parseMatcher = parsePat->matcher(unEscapedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
while(parseMatcher->find()) {
|
||||
parseMatcher->appendReplacement(deTaggedInput, "", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString groupNum = parseMatcher->group(2, status);
|
||||
if (parseMatcher->group(1, status) == "/") {
|
||||
// close tag
|
||||
set(groupEnds, deTaggedInput.length(), groupNum);
|
||||
} else {
|
||||
set(groupStarts, deTaggedInput.length(), groupNum);
|
||||
}
|
||||
}
|
||||
parseMatcher->appendTail(deTaggedInput);
|
||||
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
|
||||
|
||||
|
||||
//
|
||||
// Do a find on the de-tagged input using the caller's pattern
|
||||
//
|
||||
matcher = callerPattern->matcher(deTaggedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
isMatch = matcher->find();
|
||||
|
||||
//
|
||||
// Match up the groups from the find() with the groups from the tags
|
||||
//
|
||||
|
||||
// number of tags should match number of groups from find operation.
|
||||
// matcher->groupCount does not include group 0, the entire match, hence the +1.
|
||||
if (isMatch == FALSE && groupStarts.size() != 0) {
|
||||
errln("Error at line %d: Match expected, but none found.\n", line);
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
int i;
|
||||
for (i=0; i<=matcher->groupCount(); i++) {
|
||||
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
|
||||
if (matcher->start(i, status) != expectedStart) {
|
||||
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
|
||||
line, i, expectedStart, matcher->start(i, status));
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
|
||||
}
|
||||
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
|
||||
if (matcher->end(i, status) != expectedEnd) {
|
||||
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
|
||||
line, i, expectedEnd, matcher->end(i, status));
|
||||
failed = TRUE;
|
||||
// Error on end position; keep going; real error is probably yet to come as group
|
||||
// end positions work from end of the input data towards the front.
|
||||
}
|
||||
}
|
||||
if ( matcher->groupCount()+1 < groupStarts.size()) {
|
||||
errln("Error at line %d: Expected %d capture groups, found %d.",
|
||||
line, groupStarts.size()-1, matcher->groupCount());
|
||||
failed = TRUE;
|
||||
}
|
||||
|
||||
cleanupAndReturn:
|
||||
if (failed) {
|
||||
callerPattern->dump();
|
||||
}
|
||||
delete parseMatcher;
|
||||
delete parsePat;
|
||||
delete matcher;
|
||||
delete callerPattern;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// API_Match Test that the API for class RegexMatcher
|
||||
// is present and nominally working, but excluding functions
|
||||
// implementing replace operations.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::API_Match() {
|
||||
|
@ -388,6 +559,13 @@ void RegexTest::API_Match() {
|
|||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "bcbcdefg");
|
||||
|
||||
// TODO: need more thorough testing of capture substitutions.
|
||||
|
||||
|
||||
//
|
||||
// Non-Grouping parentheses
|
||||
//
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -396,10 +574,13 @@ void RegexTest::API_Match() {
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Basic Check for basic functionality of
|
||||
// regex pattern matching.
|
||||
// Basic Check for basic functionality of regex pattern matching.
|
||||
// Avoid the use of REGEX_FIND test macro, which has
|
||||
// substantial dependencies on basic Regex functionality.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::Basic() {
|
||||
|
@ -536,22 +717,125 @@ void RegexTest::Basic() {
|
|||
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
|
||||
REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
|
||||
|
||||
// Escape of special chars in patterns
|
||||
REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// API_Replace
|
||||
// API_Replace API test for class RegexMatcher, testing the
|
||||
// Replace family of functions.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::API_Replace() {
|
||||
//
|
||||
// Replace
|
||||
//
|
||||
int32_t flags=0;
|
||||
UParseError pe;
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
UnicodeString re("abc");
|
||||
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString data = ".abc..abc...abc..";
|
||||
// 012345678901234567
|
||||
RegexMatcher *matcher = pat->matcher(data, status);
|
||||
|
||||
//
|
||||
// Plain vanilla matches.
|
||||
//
|
||||
UnicodeString dest;
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".yz..abc...abc..");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".yz..yz...yz..");
|
||||
|
||||
//
|
||||
// Plain vanilla non-matches.
|
||||
//
|
||||
UnicodeString d2 = ".abx..abx...abx..";
|
||||
matcher->reset(d2);
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".abx..abx...abx..");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".abx..abx...abx..");
|
||||
|
||||
//
|
||||
// Empty source string
|
||||
//
|
||||
UnicodeString d3 = "";
|
||||
matcher->reset(d3);
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "");
|
||||
|
||||
//
|
||||
// Empty substitution string
|
||||
//
|
||||
matcher->reset(data); // ".abc..abc...abc.."
|
||||
dest = matcher->replaceFirst("", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "...abc...abc..");
|
||||
|
||||
dest = matcher->replaceAll("", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "........");
|
||||
|
||||
//
|
||||
// match whole string
|
||||
//
|
||||
UnicodeString d4 = "abc";
|
||||
matcher->reset(d4);
|
||||
dest = matcher->replaceFirst("xyz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "xyz");
|
||||
|
||||
dest = matcher->replaceAll("xyz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "xyz");
|
||||
|
||||
//
|
||||
// Capture Group, simple case
|
||||
//
|
||||
UnicodeString re2("a(..)");
|
||||
RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString d5 = "abcdefg";
|
||||
RegexMatcher *matcher2 = pat2->matcher(d5, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
dest = matcher2->replaceFirst("$1$1", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "bcbcdefg");
|
||||
|
||||
// TODO: need more through testing of capture substitutions.
|
||||
|
||||
|
||||
//
|
||||
// Non-Grouping parentheses
|
||||
//
|
||||
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// API_Pattern
|
||||
// API_Pattern Test that the API for class RegexPattern is
|
||||
// present and nominally working.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::API_Pattern() {
|
||||
|
@ -688,6 +972,36 @@ void RegexTest::API_Pattern() {
|
|||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Extended A more thorough check for features of regex patterns
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::Extended() {
|
||||
// Each REGEX_FIND(pattern, taggedInput) call checks one find() against the
// positions marked in the input: <0>...</0> brackets the expected overall
// match and <n>...</n> brackets the expected span of capture group n.
// A group with no tags in the input is expected not to have matched.
// The calls are kept as separate statements because the macro records
// __LINE__ for use in failure messages.
// Capturing parens
|
||||
REGEX_FIND(".(..).", "<0>a<1>bc</1>d</0>");
|
||||
REGEX_FIND(".*\\A( +hello)", "<0><1> hello</1></0>");
|
||||
REGEX_FIND("(hello)|(goodbye)", "<0><1>hello</1></0>");
|
||||
REGEX_FIND("(hello)|(goodbye)", "<0><2>goodbye</2></0>");
|
||||
REGEX_FIND("abc( +( inner(X?) +) xyz)", "leading cruft <0>abc<1> <2> inner<3></3> </2> xyz</1></0> cruft");
|
||||
|
||||
// Non-capturing parens (?: stuff). Groups, but does not capture.
|
||||
REGEX_FIND("(?:abc)*(tail)", "<0>abcabcabc<1>tail</1></0>");
|
||||
|
||||
// Non-greedy *? quantifier
// (each non-greedy pattern is followed by its greedy twin for contrast)
|
||||
REGEX_FIND(".*?(abc)", "<0> abx <1>abc</1></0> abc abc abc");
|
||||
REGEX_FIND(".*(abc)", "<0> abx abc abc abc <1>abc</1></0>");
|
||||
|
||||
REGEX_FIND( "((?:abc |xyz )*?)abc ", "<0><1>xyz </1>abc </0>abc abc ");
|
||||
REGEX_FIND( "((?:abc |xyz )*)abc ", "<0><1>xyz abc abc </1>abc </0>");
|
||||
|
||||
// Non-greedy +? quantifier
// (again paired with the greedy form of the same pattern)
|
||||
REGEX_FIND( "(a+?)(a*)", "<0><1>a</1><2>aaaaaaaaaaaa</2></0>");
|
||||
REGEX_FIND( "(a+)(a*)", "<0><1>aaaaaaaaaaaaa</1><2></2></0>");
|
||||
|
||||
REGEX_FIND( "((ab)+?)((ab)*)", "<0><1><2>ab</2></1><3>ababababab<4>ab</4></3></0>");
|
||||
REGEX_FIND( "((ab)+)((ab)*)", "<0><1>abababababab<2>ab</2></1><3></3></0>");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -25,8 +25,9 @@ public:
|
|||
virtual void API_Pattern();
|
||||
virtual void API_Replace();
|
||||
virtual void Basic();
|
||||
virtual void Extended();
|
||||
|
||||
virtual UBool doRegexLMTest(char *pat, char *text, UBool looking, UBool match, int line);
|
||||
|
||||
virtual void regex_find(char *pat, char *input, UErrorCode expectedStatus, int line);
|
||||
};
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue