ICU-2422 regexp, tests from perl, and some bug fixes

X-SVN-Rev: 10901
This commit is contained in:
Andy Heninger 2003-01-24 02:05:03 +00:00
parent f092768650
commit a92820c54b
7 changed files with 453 additions and 97 deletions

View file

@ -154,6 +154,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
fQuoteMode = FALSE;
fFreeForm = FALSE;
fMatcherDataEnd = 0;
fBackRefMax = 0;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
@ -371,6 +372,24 @@ void RegexCompile::compile(
// The pattern has now been read and processed, and the compiled code generated.
//
// Back-reference fixup
//
int32_t loc;
for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
int32_t op = fRXPat->fCompiledPat->elementAti(loc);
if (URX_TYPE(op) == URX_BACKREF) {
int32_t where = URX_VAL(op);
if (where > fRXPat->fGroupMap->size()) {
error(U_REGEX_INVALID_BACK_REF);
break;
}
where = fRXPat->fGroupMap->elementAti(where-1);
op = URX_BUILD(URX_BACKREF, where);
fRXPat->fCompiledPat->setElementAt(op, loc);
}
}
//
// Compute the number of digits required for the largest capture group number.
//
@ -608,6 +627,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
error(U_REGEX_UNIMPLEMENTED);
break;
case doConditionalExpr:
// Conditionals such as (?(1)a:b)
case doPerlInline:
// Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them.
error(U_REGEX_UNIMPLEMENTED);
break;
case doCloseParen:
handleCloseParen();
if (fParenStack.size() <= 0) {
@ -896,6 +923,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
break;
case doEscapeError:
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
break;
case doExit:
returnVal = FALSE;
break;
@ -929,9 +960,8 @@ UBool RegexCompile::doParseActions(EParseAction action)
int32_t numCaptureGroups = fRXPat->fGroupMap->size();
int32_t groupNum = 0;
UChar32 c = fC.fChar;
int32_t t;
for (t=numCaptureGroups; t>0; t=t/10) {
for (;;) {
// Loop once per digit, for max allowed number of digits in a back reference.
groupNum = groupNum * 10 + u_charDigitValue(c);
if (groupNum >= numCaptureGroups) {
@ -943,16 +973,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
}
nextCharLL();
}
if (groupNum > numCaptureGroups) {
error(U_REGEX_INVALID_BACK_REF);
break;
}
// Scan of the back reference in the source regexp is complete. Now generate
// the compiled code for it.
// the compiled code for it.
// Because capture groups can be forward-referenced by back-references,
// we fill the operand with the capture group number. At the end
// of compilation, it will be changed to the variables location.
U_ASSERT(groupNum > 0);
int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
int32_t op = URX_BUILD(URX_BACKREF, varsLoc);
// int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
int32_t op = URX_BUILD(URX_BACKREF, groupNum);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;

View file

@ -154,6 +154,12 @@ private:
int32_t fMatcherDataEnd; // Location Counter for allocation of data
// to be used by the matcher at match time.
int32_t fBackRefMax; // Number of the largest capture group with a
// back reference. Capture groups can be forward-
// referenced, so we can't flag an error on
// a too-big back ref number until the end of the
// pattern is reached.
};
U_NAMESPACE_END

View file

@ -56,6 +56,7 @@ enum Regex_PatternParseAction {
doPatFinish,
doBackslashD,
doPossesiveOpt,
doEscapeError,
doBackslashG,
doOpt,
doInterval,
@ -72,7 +73,9 @@ enum Regex_PatternParseAction {
doBackslashX,
doScanUnicodeSet,
doBackslashZ,
doPerlInline,
doNOP,
doConditionalExpr,
doExit,
doNGInterval,
doPatStart,
@ -107,13 +110,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
, {doNOP, 92 /* \ */, 72,0, TRUE} // 9
, {doPatFinish, 253, 2,0, FALSE} // 10
, {doRuleError, 255, 91,0, FALSE} // 11
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
, {doIntervalInit, 123 /* { */, 57,0, TRUE} // 15
, {doRuleError, 255, 94,0, FALSE} // 11
, {doNOP, 42 /* * */, 50,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 53,0, TRUE} // 13
, {doNOP, 63 /* ? */, 56,0, TRUE} // 14
, {doIntervalInit, 123 /* { */, 59,0, TRUE} // 15
, {doNOP, 255, 17,0, FALSE} // 16
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
@ -124,72 +127,75 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
, {doNOP, 60 /* < */, 34,0, TRUE} // 26
, {doNOP, 35 /* # */, 37,0, TRUE} // 27
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 28
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 29
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
, {doBadOpenParenType, 255, 91,0, FALSE} // 33
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
, {doBadOpenParenType, 255, 91,0, FALSE} // 36
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
, {doMismatchedParenErr, 253, 91,0, FALSE} // 38
, {doNOP, 255, 37,0, TRUE} // 39
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 42
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 43
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
, {doNOP, 255, 91,0, FALSE} // 47
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
, {doStar, 255, 17,0, FALSE} // 50
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 51 quant-plus
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 52
, {doPlus, 255, 17,0, FALSE} // 53
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 54 quant-opt
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
, {doOpt, 255, 17,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
, {doNOP, 128, 60,0, FALSE} // 58
, {doIntervalError, 255, 91,0, FALSE} // 59
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
, {doIntervalError, 255, 91,0, FALSE} // 63
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
, {doIntervalError, 255, 91,0, FALSE} // 66
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
, {doInterval, 255, 17,0, FALSE} // 69
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 70 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 71
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 72
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 73
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 74
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 75
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 76
, {doProperty, 112 /* p */, 12,0, FALSE} // 77
, {doProperty, 80 /* P */, 12,0, FALSE} // 78
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 79
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 80
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 81
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 82
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 83
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 84
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 88
, {doBackRef, 128, 12,0, TRUE} // 89
, {doLiteralChar, 255, 12,0, TRUE} // 90
, {doExit, 255, 91,0, TRUE} // 91 errorDeath
, {doNOP, 60 /* < */, 36,0, TRUE} // 26
, {doNOP, 35 /* # */, 39,0, TRUE} // 27
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 28
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 29
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 32
, {doConditionalExpr, 40 /* ( */, 94,0, TRUE} // 33
, {doPerlInline, 123 /* { */, 94,0, TRUE} // 34
, {doBadOpenParenType, 255, 94,0, FALSE} // 35
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 36 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 37
, {doBadOpenParenType, 255, 94,0, FALSE} // 38
, {doNOP, 41 /* ) */, 2,0, TRUE} // 39 paren-comment
, {doMismatchedParenErr, 253, 94,0, FALSE} // 40
, {doNOP, 255, 39,0, TRUE} // 41
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 42 paren-flag
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 43
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 44
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 45
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 46
, {doNOP, 41 /* ) */, 2,0, TRUE} // 47
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 48
, {doNOP, 255, 94,0, FALSE} // 49
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 50 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 51
, {doStar, 255, 17,0, FALSE} // 52
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 53 quant-plus
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 54
, {doPlus, 255, 17,0, FALSE} // 55
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 56 quant-opt
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 57
, {doOpt, 255, 17,0, FALSE} // 58
, {doNOP, 129, 59,0, TRUE} // 59 interval-open
, {doNOP, 128, 62,0, FALSE} // 60
, {doIntervalError, 255, 94,0, FALSE} // 61
, {doIntevalLowerDigit, 128, 62,0, TRUE} // 62 interval-lower
, {doNOP, 44 /* , */, 66,0, TRUE} // 63
, {doIntervalSame, 125 /* } */, 69,0, TRUE} // 64
, {doIntervalError, 255, 94,0, FALSE} // 65
, {doIntervalUpperDigit, 128, 66,0, TRUE} // 66 interval-upper
, {doNOP, 125 /* } */, 69,0, TRUE} // 67
, {doIntervalError, 255, 94,0, FALSE} // 68
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 69 interval-type
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 70
, {doInterval, 255, 17,0, FALSE} // 71
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 72 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 73
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 74
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 75
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 76
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 77
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 78
, {doProperty, 112 /* p */, 12,0, FALSE} // 79
, {doProperty, 80 /* P */, 12,0, FALSE} // 80
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 81
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 82
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 83
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 84
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 85
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 86
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 87
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 88
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 89
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 90
, {doBackRef, 128, 12,0, TRUE} // 91
, {doEscapeError, 253, 94,0, FALSE} // 92
, {doLiteralChar, 255, 12,0, TRUE} // 93
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -224,6 +230,8 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"open-paren-lookbehind",
0,
@ -281,6 +289,7 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -119,6 +119,8 @@ open-paren-extended:
's' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
'(' n errorDeath doConditionalExpr
'{' n errorDeath doPerlInline
default errorDeath doBadOpenParenType
open-paren-lookbehind:
@ -230,8 +232,9 @@ backslash:
'Z' n term doBackslashZ
'z' n term doBackslashz
'0' n expr-quant doOctal
digit_char expr-quant doBackRef # Will scan multiple digits
default n expr-quant doLiteralChar # Escaped literal char.
digit_char n expr-quant doBackRef # Will scan multiple digits
eof errorDeath doEscapeError
default n expr-quant doLiteralChar # Escaped literal char.

View file

@ -24,8 +24,6 @@
#include "uvectr32.h"
#include "regeximp.h"
//#include "stdio.h"
//#include "malloc.h"
U_NAMESPACE_BEGIN
@ -222,9 +220,14 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
UBool RegexMatcher::find() {
// Start at the position of the last match end. (Will be zero if the
// matcher has been reset.)
//
// TODO: Needs optimization
UErrorCode status = U_ZERO_ERROR;
int32_t startPos;
// TODO: needs to go up to the very end, so a pattern that can match a zero-length
// string can match at the end of a string. Can't do until loop-breaking
// is added to the engine, though, otherwise it triggers too many bugs.
for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
@ -477,22 +480,27 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
// We are at a boundary if this char and the original chars are
// opposite in membership in the \w set.
//
// parameters: pos - the current position in the input buffer
// start - the position where the match operation started.
// don't backup before this position when looking back
// for a preceding base char.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int32_t pos) {
UBool isBoundary = FALSE;
if (pos >= fInputLength) {
// off end of string. Not a boundary.
return FALSE;
}
UBool cIsWord = FALSE;
// Determine whether char c at Pos is a member of the word set of chars.
UChar32 c = fInput->char32At(pos);
int8_t ctype = u_charType(c);
if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
// Current char is a combining one. Not a boundary.
return FALSE;
// Determine whether char c at current position is a member of the word set of chars.
// If we're off the end of the string, behave as though we're not at a word char.
if (pos < fInputLength) {
UChar32 c = fInput->char32At(pos);
int8_t ctype = u_charType(c);
if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
// Current char is a combining one. Not a boundary.
return FALSE;
}
cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
}
UBool cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
// Back up until we come to a non-combining char, determine whether
// that char is a word char.

View file

@ -14,6 +14,7 @@
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "intltest.h"
#include "regextst.h"
#include "uvector.h"
@ -59,6 +60,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 5: name = "Errors";
if (exec) Errors();
break;
case 6: name = "PerlTests";
// if (exec) PerlTests();
break;
default: name = "";
break; //needed to end loop
@ -368,7 +373,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_TESTLM("(abc)*+a", "abcabcabc", FALSE, FALSE);
REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE);
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
@ -1109,6 +1114,8 @@ void RegexTest::Extended() {
// \b \B
REGEX_FIND( ".*?\\b(.).*", "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>");
REGEX_FIND( "\\ba\\b", "-<0>a</0>");
REGEX_FIND("\\by\\b", "xy");
// Finds first chars of up to 5 words
REGEX_FIND( "(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?",
@ -1319,5 +1326,297 @@ void RegexTest::Errors() {
}
//---------------------------------------------------------------------------
//
// PerlTests Run Perl's regexp tests.
//
//---------------------------------------------------------------------------
//
//  ReplaceFirst   Compile 'pattern', and replace its first match in 'target'
//                 with 'replacement'.  'target' is modified in place.
//
//      status     Nothing is done, and FALSE is returned, if this already
//                 indicates a failure on entry.
//      returns    TRUE if, after the replacement, the matcher reports a start
//                 position other than -1 for group 0.  FALSE if the pattern
//                 could not be compiled or no matcher could be created.
//
static UBool ReplaceFirst(UnicodeString &target, const UnicodeString &pattern,
                          const UnicodeString &replacement, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return FALSE;
    }

    UParseError   pe;
    UBool         retVal = FALSE;
    RegexMatcher *mat    = NULL;
    RegexPattern *pat    = RegexPattern::compile(pattern, 0, pe, status);

    if (pat != NULL) {
        mat = pat->matcher(target, status);
    }
    if (mat != NULL) {
        // Only touch the matcher when one was actually created; a failed
        // compile or matcher() call leaves it NULL.
        target = mat->replaceFirst(replacement, status);
        retVal = (mat->start(0, status) != -1);
    }

    delete mat;
    delete pat;
    return retVal;
}
static char *cstar(const UnicodeString &s) {
UErrorCode status=U_ZERO_ERROR;
static char buf[1000];
s.extract(buf, 1000, NULL, status);
buf[999] = 0;
return buf;
}
//-------------------------------------------------------------------------------
//
//   ReadAndConvertFile   Read a text data file, convert it to UChars, and return
//                        the data in one big UChar * buffer, which the caller
//                        must release with delete [].
//
//        fileName   Path of the file to read.  It is opened in binary mode.
//        ulen       Output.  Number of UChars reported by the conversion
//                   preflight; set to zero on any failure.
//        status     ICU error code.  If it indicates a failure on entry the
//                   function returns NULL without doing anything.  Set to
//                   U_FILE_ACCESS_ERROR when the file can not be opened or read.
//        returns    The converted text, or NULL on failure.
//
//--------------------------------------------------------------------------------
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
    UChar       *retPtr  = NULL;
    char        *fileBuf = NULL;
    UConverter  *conv    = NULL;
    FILE        *f       = NULL;

    ulen = 0;
    {
        if (U_FAILURE(status)) {
            return retPtr;
        }

        //
        //  Open the file.
        //
        f = fopen(fileName, "rb");
        if (f == 0) {
            errln("Error opening test data file %s\n", fileName);
            status = U_FILE_ACCESS_ERROR;
            goto cleanUpAndReturn;
        }

        //
        //  Find the file size.  Validate it before it is used as an allocation
        //  size; ftell() reports failure with a negative value.
        //
        fseek(f, 0, SEEK_END);
        int fileSize = (int)ftell(f);
        fseek(f, 0, SEEK_SET);
        if (fileSize <= 0) {
            errln("Error reading test data file.");
            status = U_FILE_ACCESS_ERROR;
            goto cleanUpAndReturn;
        }

        //
        //  Read it in
        //
        fileBuf = new char[fileSize];
        int amt_read = (int)fread(fileBuf, 1, fileSize, f);
        if (amt_read != fileSize) {
            errln("Error reading test data file.");
            status = U_FILE_ACCESS_ERROR;
            goto cleanUpAndReturn;
        }

        //
        //  Look for a Unicode Signature (BOM) on the data just read.
        //  When one is found, skip over it for the conversion.
        //
        int32_t      signatureLength;
        const char  *fileBufC = fileBuf;
        const char  *encoding = ucnv_detectUnicodeSignature(
                                    fileBuf, fileSize, &signatureLength, &status);
        if (encoding != NULL) {
            fileBufC += signatureLength;
            fileSize -= signatureLength;
        }

        //
        //  Open a converter to take the file to UTF-16.
        //  'encoding' is NULL when no signature was detected.
        //
        conv = ucnv_open(encoding, &status);
        if (U_FAILURE(status)) {
            errln("ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
            goto cleanUpAndReturn;
        }

        //
        //  Convert the data to UChar.
        //  Preflight first to determine required buffer size.
        //
        ulen = ucnv_toUChars(conv,
                             NULL,           //  dest,
                             0,              //  destCapacity,
                             fileBufC,
                             fileSize,
                             &status);
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            // Buffer Overflow is expected from the preflight operation.
            status = U_ZERO_ERROR;
        }
        if (U_FAILURE(status)) {
            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
            goto cleanUpAndReturn;
        }

        retPtr = new UChar[ulen+1];
        ucnv_toUChars(conv,
                      retPtr,       //  dest,
                      ulen+1,
                      fileBufC,
                      fileSize,
                      &status);
        if (U_FAILURE(status)) {
            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        }
    }

cleanUpAndReturn:
    if (f != NULL) {
        // Skipped when the open itself failed and there is no stream to close.
        fclose(f);
    }
    delete [] fileBuf;
    ucnv_close(conv);
    if (U_FAILURE(status)) {
        // Never hand back a partially filled buffer.
        delete [] retPtr;
        retPtr = 0;
        ulen   = 0;
    }
    return retPtr;
}
//-------------------------------------------------------------------------------
//
// PerlTests - Run Perl's regular expression tests
//
//-------------------------------------------------------------------------------
void RegexTest::PerlTests() {
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
//
// Open and read the test data file.
//
const char *testDataDirectory = loadTestData(status);
UnicodeString tdd(testDataDirectory);
ReplaceFirst(tdd, "([/\\\\])out[/\\\\]testdata", "$1re_tests.txt", status);
int len;
UChar *testData = ReadAndConvertFile(cstar(tdd), len, status);
//
// Put the test data into a UnicodeString
//
UnicodeString ruleSourceS(FALSE, testData, len);
//
// Regex to break the input file into lines, and strip the new lines.
// One line per match, capture group one is the desired data.
//
RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
RegexMatcher* lineMat = linePat->matcher(ruleSourceS, status);
//
// Regex to split a test file line into fields.
// There are six fields, separated by tabs.
//
RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
//
// Regex to identify test patterns with flag settings, and to separate them.
// Test patterns with flags look like 'pattern'i
// Test patterns without flags are not quoted: paterrn
// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
//
RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
RegexMatcher* flagMat = flagPat->matcher("", status);
//
// Regex to find ${bang}. Perl doesn't put literal '!'s into patterns.
//
RegexPattern *bangPat = RegexPattern::compile("\\$\\{bang\\}", 0, pe, status);
RegexMatcher *bangMat = bangPat->matcher("", status);
int32_t lineNum = 0;
int32_t skippedUnimplementedCount = 0;
while (lineMat->find()) {
lineNum++;
UnicodeString line = lineMat->group(1, status);
UnicodeString fields[7];
fieldPat->split(line, fields, 7, status);
flagMat->reset(fields[0]);
flagMat->matches(status);
UnicodeString pattern = flagMat->group(2, status);
bangMat->reset(pattern);
pattern = bangMat->replaceAll("\\u0021", status);
UnicodeString flagStr = flagMat->group(3, status);
// printf("pattern = %s\n", cstar(pattern));
// printf(" flags = %s\n", cstar(flags));
if (U_FAILURE(status)) {
errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
return;
}
int32_t flags = 0;
const UChar UChar_c = 0x63; // Damn the lack of Unicode support in C
const UChar UChar_i = 0x69;
const UChar UChar_m = 0x6d;
const UChar UChar_x = 0x78;
const UChar UChar_y = 0x79;
if (flagStr.indexOf(UChar_i) != -1) {
flags |= UREGEX_CASE_INSENSITIVE;
}
if (flagStr.indexOf(UChar_m) != -1) {
flags |= UREGEX_MULTILINE;
}
if (flagStr.indexOf(UChar_x) != -1) {
flags |= UREGEX_COMMENTS;
}
//
// Compile the test pattern.
//
status = U_ZERO_ERROR;
RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
if (status == U_REGEX_UNIMPLEMENTED) {
skippedUnimplementedCount++;
delete testPat;
status = U_ZERO_ERROR;
continue;
}
if (U_FAILURE(status)) {
// Some tests are supposed to generate errors.
// Only report an error for tests that are supposed to succeed.
if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
{
errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
}
status = U_ZERO_ERROR;
delete testPat;
continue;
}
if (fields[2].indexOf(UChar_i) >= 0) {
// ICU should skip this test.
delete testPat;
continue;
}
if (fields[2].indexOf(UChar_c) >= 0) {
// This pattern should have caused a compilation error, but didn't/
errln("line %d: Expected a pattern compile error, got success.", lineNum);
delete testPat;
continue;
}
//
// Run the test
//
RegexMatcher *testMat = testPat->matcher(fields[1], status);
UBool found = testMat->find();
UBool expected = FALSE;
if (fields[2].indexOf(UChar_y) >=0) {
expected = TRUE;
}
if (expected != found) {
errln("line %d: Expected %smatch, got %smatch",
lineNum, expected?"":"no ", found?"":"no " );
}
delete testMat;
delete testPat;
}
logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -30,12 +30,14 @@ public:
virtual void Basic();
virtual void Extended();
virtual void Errors();
virtual void PerlTests();
// The following functions are internal to the regexp tests.
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
UErrorCode expectedStatus, int line);
virtual UChar *ReadAndConvertFile(const char *fileName, int &len, UErrorCode &status);
};
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS