mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2422 regexp, tests from perl, and some bug fixes
X-SVN-Rev: 10901
This commit is contained in:
parent
f092768650
commit
a92820c54b
7 changed files with 453 additions and 97 deletions
icu4c/source
|
@ -154,6 +154,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fMatcherDataEnd = 0;
|
||||
fBackRefMax = 0;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
|
@ -371,6 +372,24 @@ void RegexCompile::compile(
|
|||
// The pattern has now been read and processed, and the compiled code generated.
|
||||
//
|
||||
|
||||
// Back-reference fixup
|
||||
//
|
||||
int32_t loc;
|
||||
for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
|
||||
int32_t op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
if (URX_TYPE(op) == URX_BACKREF) {
|
||||
int32_t where = URX_VAL(op);
|
||||
if (where > fRXPat->fGroupMap->size()) {
|
||||
error(U_REGEX_INVALID_BACK_REF);
|
||||
break;
|
||||
}
|
||||
where = fRXPat->fGroupMap->elementAti(where-1);
|
||||
op = URX_BUILD(URX_BACKREF, where);
|
||||
fRXPat->fCompiledPat->setElementAt(op, loc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Compute the number of digits required for the largest capture group number.
|
||||
//
|
||||
|
@ -608,6 +627,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doConditionalExpr:
|
||||
// Conditionals such as (?(1)a:b)
|
||||
case doPerlInline:
|
||||
// Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
case doCloseParen:
|
||||
handleCloseParen();
|
||||
if (fParenStack.size() <= 0) {
|
||||
|
@ -896,6 +923,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doEscapeError:
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
break;
|
||||
|
||||
case doExit:
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
@ -929,9 +960,8 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
int32_t numCaptureGroups = fRXPat->fGroupMap->size();
|
||||
int32_t groupNum = 0;
|
||||
UChar32 c = fC.fChar;
|
||||
int32_t t;
|
||||
|
||||
for (t=numCaptureGroups; t>0; t=t/10) {
|
||||
for (;;) {
|
||||
// Loop once per digit, for max allowed number of digits in a back reference.
|
||||
groupNum = groupNum * 10 + u_charDigitValue(c);
|
||||
if (groupNum >= numCaptureGroups) {
|
||||
|
@ -943,16 +973,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
}
|
||||
nextCharLL();
|
||||
}
|
||||
if (groupNum > numCaptureGroups) {
|
||||
error(U_REGEX_INVALID_BACK_REF);
|
||||
break;
|
||||
}
|
||||
|
||||
// Scan of the back reference in the source regexp is complete. Now generate
|
||||
// the compiled code for it.
|
||||
// the compiled code for it.
|
||||
// Because capture groups can be forward-referenced by back-references,
|
||||
// we fill the operand with the capture group number. At the end
|
||||
// of compilation, it will be changed to the variables location.
|
||||
U_ASSERT(groupNum > 0);
|
||||
int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
|
||||
int32_t op = URX_BUILD(URX_BACKREF, varsLoc);
|
||||
// int32_t varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
|
||||
int32_t op = URX_BUILD(URX_BACKREF, groupNum);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -154,6 +154,12 @@ private:
|
|||
|
||||
int32_t fMatcherDataEnd; // Location Counter for allocation of data
|
||||
// to be used by the matcher at match time.
|
||||
|
||||
int32_t fBackRefMax; // Number of the largest capture group with a
|
||||
// back reference. Capture groups can be forward-
|
||||
// referenced, so we can't flag an error on
|
||||
// a too-big back ref number until the end of the
|
||||
// pattern is reached.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -56,6 +56,7 @@ enum Regex_PatternParseAction {
|
|||
doPatFinish,
|
||||
doBackslashD,
|
||||
doPossesiveOpt,
|
||||
doEscapeError,
|
||||
doBackslashG,
|
||||
doOpt,
|
||||
doInterval,
|
||||
|
@ -72,7 +73,9 @@ enum Regex_PatternParseAction {
|
|||
doBackslashX,
|
||||
doScanUnicodeSet,
|
||||
doBackslashZ,
|
||||
doPerlInline,
|
||||
doNOP,
|
||||
doConditionalExpr,
|
||||
doExit,
|
||||
doNGInterval,
|
||||
doPatStart,
|
||||
|
@ -107,13 +110,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 72,0, TRUE} // 9
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 10
|
||||
, {doRuleError, 255, 91,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
|
||||
, {doIntervalInit, 123 /* { */, 57,0, TRUE} // 15
|
||||
, {doRuleError, 255, 94,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 50,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 53,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 56,0, TRUE} // 14
|
||||
, {doIntervalInit, 123 /* { */, 59,0, TRUE} // 15
|
||||
, {doNOP, 255, 17,0, FALSE} // 16
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
|
||||
|
@ -124,72 +127,75 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
|
||||
, {doNOP, 60 /* < */, 34,0, TRUE} // 26
|
||||
, {doNOP, 35 /* # */, 37,0, TRUE} // 27
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 28
|
||||
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 29
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
|
||||
, {doBadOpenParenType, 255, 91,0, FALSE} // 33
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
|
||||
, {doBadOpenParenType, 255, 91,0, FALSE} // 36
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
|
||||
, {doMismatchedParenErr, 253, 91,0, FALSE} // 38
|
||||
, {doNOP, 255, 37,0, TRUE} // 39
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 42
|
||||
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 43
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
|
||||
, {doNOP, 255, 91,0, FALSE} // 47
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
|
||||
, {doStar, 255, 17,0, FALSE} // 50
|
||||
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 51 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 52
|
||||
, {doPlus, 255, 17,0, FALSE} // 53
|
||||
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 54 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
|
||||
, {doOpt, 255, 17,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
|
||||
, {doNOP, 128, 60,0, FALSE} // 58
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 59
|
||||
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
|
||||
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
|
||||
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 63
|
||||
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
|
||||
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
|
||||
, {doIntervalError, 255, 91,0, FALSE} // 66
|
||||
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
|
||||
, {doInterval, 255, 17,0, FALSE} // 69
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 70 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 71
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 72
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 73
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 74
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 75
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 76
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 77
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 78
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 79
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 80
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 81
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 82
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 83
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 84
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
|
||||
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 88
|
||||
, {doBackRef, 128, 12,0, TRUE} // 89
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 90
|
||||
, {doExit, 255, 91,0, TRUE} // 91 errorDeath
|
||||
, {doNOP, 60 /* < */, 36,0, TRUE} // 26
|
||||
, {doNOP, 35 /* # */, 39,0, TRUE} // 27
|
||||
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 28
|
||||
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 29
|
||||
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 32
|
||||
, {doConditionalExpr, 40 /* ( */, 94,0, TRUE} // 33
|
||||
, {doPerlInline, 123 /* { */, 94,0, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 94,0, FALSE} // 35
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 36 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 37
|
||||
, {doBadOpenParenType, 255, 94,0, FALSE} // 38
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 39 paren-comment
|
||||
, {doMismatchedParenErr, 253, 94,0, FALSE} // 40
|
||||
, {doNOP, 255, 39,0, TRUE} // 41
|
||||
, {doMatchMode, 105 /* i */, 42,0, TRUE} // 42 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 42,0, TRUE} // 43
|
||||
, {doMatchMode, 109 /* m */, 42,0, TRUE} // 44
|
||||
, {doMatchMode, 120 /* x */, 42,0, TRUE} // 45
|
||||
, {doMatchMode, 45 /* - */, 42,0, TRUE} // 46
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 47
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 48
|
||||
, {doNOP, 255, 94,0, FALSE} // 49
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 50 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 51
|
||||
, {doStar, 255, 17,0, FALSE} // 52
|
||||
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 53 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 54
|
||||
, {doPlus, 255, 17,0, FALSE} // 55
|
||||
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 56 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 57
|
||||
, {doOpt, 255, 17,0, FALSE} // 58
|
||||
, {doNOP, 129, 59,0, TRUE} // 59 interval-open
|
||||
, {doNOP, 128, 62,0, FALSE} // 60
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 61
|
||||
, {doIntevalLowerDigit, 128, 62,0, TRUE} // 62 interval-lower
|
||||
, {doNOP, 44 /* , */, 66,0, TRUE} // 63
|
||||
, {doIntervalSame, 125 /* } */, 69,0, TRUE} // 64
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 65
|
||||
, {doIntervalUpperDigit, 128, 66,0, TRUE} // 66 interval-upper
|
||||
, {doNOP, 125 /* } */, 69,0, TRUE} // 67
|
||||
, {doIntervalError, 255, 94,0, FALSE} // 68
|
||||
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 69 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 70
|
||||
, {doInterval, 255, 17,0, FALSE} // 71
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 72 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 73
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 74
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 75
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 76
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 77
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 78
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 79
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 80
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 81
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 82
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 83
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 84
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 85
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 86
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 87
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 88
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 89
|
||||
, {doOctal, 48 /* 0 */, 12,0, TRUE} // 90
|
||||
, {doBackRef, 128, 12,0, TRUE} // 91
|
||||
, {doEscapeError, 253, 94,0, FALSE} // 92
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 93
|
||||
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -224,6 +230,8 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
|
@ -281,6 +289,7 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
|
|
@ -119,6 +119,8 @@ open-paren-extended:
|
|||
's' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
'(' n errorDeath doConditionalExpr
|
||||
'{' n errorDeath doPerlInline
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
open-paren-lookbehind:
|
||||
|
@ -230,8 +232,9 @@ backslash:
|
|||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
'0' n expr-quant doOctal
|
||||
digit_char expr-quant doBackRef # Will scan multiple digits
|
||||
default n expr-quant doLiteralChar # Escaped literal char.
|
||||
digit_char n expr-quant doBackRef # Will scan multiple digits
|
||||
eof errorDeath doEscapeError
|
||||
default n expr-quant doLiteralChar # Escaped literal char.
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -24,8 +24,6 @@
|
|||
#include "uvectr32.h"
|
||||
#include "regeximp.h"
|
||||
|
||||
//#include "stdio.h"
|
||||
//#include "malloc.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -222,9 +220,14 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
|||
UBool RegexMatcher::find() {
|
||||
// Start at the position of the last match end. (Will be zero if the
|
||||
// matcher has been reset.
|
||||
//
|
||||
// TODO: Needs optimization
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
int32_t startPos;
|
||||
// TODO: needs to go up to the very end, so a pattern that can match a zero length
|
||||
// string can match at the end of a string. Can't do until loop-breaking
|
||||
// is added to the engine, though, otherwise it triggers too many bugs.
|
||||
for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) {
|
||||
MatchAt(startPos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -477,22 +480,27 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
|||
// We are at a boundary if this char and the original chars are
|
||||
// opposite in membership in \w set
|
||||
//
|
||||
// parameters: pos - the current position in the input buffer
|
||||
// start - the position where the match operation started.
|
||||
// don't backup before this position when looking back
|
||||
// for a preceding base char.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
||||
UBool isBoundary = FALSE;
|
||||
if (pos >= fInputLength) {
|
||||
// off end of string. Not a boundary.
|
||||
return FALSE;
|
||||
}
|
||||
UBool cIsWord = FALSE;
|
||||
|
||||
// Determine whether char c at Pos is a member of the word set of chars.
|
||||
UChar32 c = fInput->char32At(pos);
|
||||
int8_t ctype = u_charType(c);
|
||||
if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
|
||||
// Current char is a combining one. Not a boundary.
|
||||
return FALSE;
|
||||
// Determine whether char c at current position is a member of the word set of chars.
|
||||
// If we're off the end of the string, behave as though we're not at a word char.
|
||||
if (pos < fInputLength) {
|
||||
UChar32 c = fInput->char32At(pos);
|
||||
int8_t ctype = u_charType(c);
|
||||
if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
|
||||
// Current char is a combining one. Not a boundary.
|
||||
return FALSE;
|
||||
}
|
||||
cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
|
||||
}
|
||||
UBool cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
|
||||
|
||||
// Back up until we come to a non-combining char, determine whether
|
||||
// that char is a word char.
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "intltest.h"
|
||||
#include "regextst.h"
|
||||
#include "uvector.h"
|
||||
|
@ -59,6 +60,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
case 5: name = "Errors";
|
||||
if (exec) Errors();
|
||||
break;
|
||||
case 6: name = "PerlTests";
|
||||
// if (exec) PerlTests();
|
||||
break;
|
||||
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
|
@ -368,7 +373,7 @@ void RegexTest::Basic() {
|
|||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_TESTLM("(abc)*+a", "abcabcabc", FALSE, FALSE);
|
||||
REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE);
|
||||
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
|
@ -1109,6 +1114,8 @@ void RegexTest::Extended() {
|
|||
|
||||
// \b \B
|
||||
REGEX_FIND( ".*?\\b(.).*", "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>");
|
||||
REGEX_FIND( "\\ba\\b", "-<0>a</0>");
|
||||
REGEX_FIND("\\by\\b", "xy");
|
||||
|
||||
// Finds first chars of up to 5 words
|
||||
REGEX_FIND( "(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?",
|
||||
|
@ -1319,5 +1326,297 @@ void RegexTest::Errors() {
|
|||
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// PerlTests Run Perl's regexp tests.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
// Replace the first match of the regular expression `pattern` in `target`
// with `replacement`, modifying `target` in place.
//
//   target       string to search; overwritten with the result of the replacement.
//   pattern      regular expression source, compiled with no flags.
//   replacement  replacement text handed to RegexMatcher::replaceFirst().
//   status       ICU error code.  If it already indicates failure on entry,
//                nothing is done and FALSE is returned.
//
// Returns TRUE when the matcher reports a match start other than -1 after
// the replacement, FALSE otherwise -- including when the pattern could not
// be compiled or a matcher could not be created.
static UBool ReplaceFirst(UnicodeString &target, const UnicodeString &pattern,
                          const UnicodeString &replacement, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return FALSE;
    }

    UParseError   pe;
    UBool         retVal = FALSE;
    RegexMatcher *mat    = NULL;
    RegexPattern *pat    = RegexPattern::compile(pattern, 0, pe, status);

    if (pat != NULL) {
        mat = pat->matcher(target, status);
    }
    if (mat != NULL) {
        target = mat->replaceFirst(replacement, status);
        // Only query the matcher when one actually exists.  Previously this
        // ran unconditionally and dereferenced NULL whenever compilation or
        // matcher creation had failed.
        retVal = (mat->start(0, status) != -1);
    }

    delete mat;
    delete pat;
    return retVal;
}
|
||||
|
||||
static char *cstar(const UnicodeString &s) {
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
static char buf[1000];
|
||||
s.extract(buf, 1000, NULL, status);
|
||||
buf[999] = 0;
|
||||
return buf;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// Read a text data file, convert it to UChars, and return the data
|
||||
// in one big UChar * buffer, which the caller must delete [].
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
//   fileName  path of the file to read (opened in binary mode).
//   ulen      output: number of UChars in the returned buffer; 0 on failure.
//   status    ICU error code.  If already failing on entry the function
//             returns NULL immediately without touching the file.
//
//   Returns a buffer allocated with new UChar[] (release it with delete []),
//   or NULL if the file could not be opened, read or converted.  Open and
//   read failures are reported through errln() and leave `status` unchanged.
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
    UChar       *retPtr  = NULL;
    char        *fileBuf = NULL;
    UConverter  *conv    = NULL;
    FILE        *f       = NULL;

    ulen = 0;
    {
        // The extra scope lets the gotos below leave the block without
        // jumping across the initialized locals declared inside it.
        if (U_FAILURE(status)) {
            return retPtr;
        }

        //
        //  Open the file.
        //
        f = fopen(fileName, "rb");
        if (f == 0) {
            errln("Error opening test data file %s\n", fileName);
            goto cleanUpAndReturn;
        }

        //
        //  Read it in.  The size is validated before allocating, so that a
        //  failed ftell() (-1) or an empty file never reaches new[].
        //
        fseek(f, 0, SEEK_END);
        int fileSize = (int)ftell(f);
        if (fileSize <= 0) {
            errln("Error reading test data file.");
            goto cleanUpAndReturn;
        }
        fileBuf = new char[fileSize];
        fseek(f, 0, SEEK_SET);
        int amt_read = (int)fread(fileBuf, 1, fileSize, f);
        if (amt_read != fileSize) {
            errln("Error reading test data file.");
            goto cleanUpAndReturn;
        }

        //
        //  Look for a Unicode Signature (BOM) on the data just read,
        //  and skip over it if one is present.
        //
        int32_t      signatureLength;
        const char  *fileBufC = fileBuf;
        const char  *encoding = ucnv_detectUnicodeSignature(
                                    fileBuf, fileSize, &signatureLength, &status);
        if (encoding != NULL) {
            fileBufC += signatureLength;
            fileSize -= signatureLength;
        }

        //
        //  Open a converter to take the file contents to UTF-16.
        //  (encoding is NULL here when no signature was detected.)
        //
        conv = ucnv_open(encoding, &status);
        if (U_FAILURE(status)) {
            goto cleanUpAndReturn;
        }

        //
        //  Convert the data to UChar.
        //  Preflight first to determine the required buffer size.
        //
        ulen = ucnv_toUChars(conv,
                             NULL,           //  dest,
                             0,              //  destCapacity,
                             fileBufC,
                             fileSize,
                             &status);
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            // Buffer Overflow is expected from the preflight operation.
            status = U_ZERO_ERROR;
        }
        if (U_FAILURE(status)) {
            goto cleanUpAndReturn;
        }

        retPtr = new UChar[ulen+1];
        ucnv_toUChars(conv,
                      retPtr,       //  dest,
                      ulen+1,
                      fileBufC,
                      fileSize,
                      &status);
    }

cleanUpAndReturn:
    if (f != NULL) {
        // fclose() must not be handed a NULL stream (the fopen-failure path).
        fclose(f);
    }
    delete [] fileBuf;          // allocated with new[]
    ucnv_close(conv);
    if (U_FAILURE(status)) {
        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        delete [] retPtr;       // allocated with new[]
        retPtr = 0;
        ulen   = 0;
    }
    return retPtr;
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// PerlTests - Run Perl's regular expression tests
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
void RegexTest::PerlTests() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
|
||||
//
|
||||
// Open and read the test data file.
|
||||
//
|
||||
const char *testDataDirectory = loadTestData(status);
|
||||
UnicodeString tdd(testDataDirectory);
|
||||
ReplaceFirst(tdd, "([/\\\\])out[/\\\\]testdata", "$1re_tests.txt", status);
|
||||
|
||||
int len;
|
||||
UChar *testData = ReadAndConvertFile(cstar(tdd), len, status);
|
||||
|
||||
//
|
||||
// Put the test data into a UnicodeString
|
||||
//
|
||||
UnicodeString ruleSourceS(FALSE, testData, len);
|
||||
|
||||
//
|
||||
// Regex to break the input file into lines, and strip the new lines.
|
||||
// One line per match, capture group one is the desired data.
|
||||
//
|
||||
RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
|
||||
RegexMatcher* lineMat = linePat->matcher(ruleSourceS, status);
|
||||
|
||||
//
|
||||
// Regex to split a test file line into fields.
|
||||
// There are six fields, separated by tabs.
|
||||
//
|
||||
RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
|
||||
|
||||
//
|
||||
// Regex to identify test patterns with flag settings, and to separate them.
|
||||
// Test patterns with flags look like 'pattern'i
|
||||
// Test patterns without flags are not quoted: paterrn
|
||||
// Coming out, capture group 2 is the pattern, capture group 3 is the flags.
|
||||
//
|
||||
RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
|
||||
RegexMatcher* flagMat = flagPat->matcher("", status);
|
||||
|
||||
//
|
||||
// Regex to find ${bang}. Perl doesn't put literal '!'s into patterns.
|
||||
//
|
||||
RegexPattern *bangPat = RegexPattern::compile("\\$\\{bang\\}", 0, pe, status);
|
||||
RegexMatcher *bangMat = bangPat->matcher("", status);
|
||||
|
||||
|
||||
int32_t lineNum = 0;
|
||||
int32_t skippedUnimplementedCount = 0;
|
||||
while (lineMat->find()) {
|
||||
lineNum++;
|
||||
UnicodeString line = lineMat->group(1, status);
|
||||
UnicodeString fields[7];
|
||||
fieldPat->split(line, fields, 7, status);
|
||||
|
||||
flagMat->reset(fields[0]);
|
||||
flagMat->matches(status);
|
||||
UnicodeString pattern = flagMat->group(2, status);
|
||||
bangMat->reset(pattern);
|
||||
pattern = bangMat->replaceAll("\\u0021", status);
|
||||
UnicodeString flagStr = flagMat->group(3, status);
|
||||
// printf("pattern = %s\n", cstar(pattern));
|
||||
// printf(" flags = %s\n", cstar(flags));
|
||||
if (U_FAILURE(status)) {
|
||||
errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t flags = 0;
|
||||
const UChar UChar_c = 0x63; // Damn the lack of Unicode support in C
|
||||
const UChar UChar_i = 0x69;
|
||||
const UChar UChar_m = 0x6d;
|
||||
const UChar UChar_x = 0x78;
|
||||
const UChar UChar_y = 0x79;
|
||||
if (flagStr.indexOf(UChar_i) != -1) {
|
||||
flags |= UREGEX_CASE_INSENSITIVE;
|
||||
}
|
||||
if (flagStr.indexOf(UChar_m) != -1) {
|
||||
flags |= UREGEX_MULTILINE;
|
||||
}
|
||||
if (flagStr.indexOf(UChar_x) != -1) {
|
||||
flags |= UREGEX_COMMENTS;
|
||||
}
|
||||
|
||||
//
|
||||
// Compile the test pattern.
|
||||
//
|
||||
status = U_ZERO_ERROR;
|
||||
RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
|
||||
if (status == U_REGEX_UNIMPLEMENTED) {
|
||||
skippedUnimplementedCount++;
|
||||
delete testPat;
|
||||
status = U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
// Some tests are supposed to generate errors.
|
||||
// Only report an error for tests that are supposed to succeed.
|
||||
if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
|
||||
fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
|
||||
{
|
||||
errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fields[2].indexOf(UChar_i) >= 0) {
|
||||
// ICU should skip this test.
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fields[2].indexOf(UChar_c) >= 0) {
|
||||
// This pattern should have caused a compilation error, but didn't/
|
||||
errln("line %d: Expected a pattern compile error, got success.", lineNum);
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Run the test
|
||||
//
|
||||
RegexMatcher *testMat = testPat->matcher(fields[1], status);
|
||||
UBool found = testMat->find();
|
||||
UBool expected = FALSE;
|
||||
if (fields[2].indexOf(UChar_y) >=0) {
|
||||
expected = TRUE;
|
||||
}
|
||||
if (expected != found) {
|
||||
errln("line %d: Expected %smatch, got %smatch",
|
||||
lineNum, expected?"":"no ", found?"":"no " );
|
||||
}
|
||||
|
||||
|
||||
|
||||
delete testMat;
|
||||
delete testPat;
|
||||
}
|
||||
|
||||
logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
||||
|
|
|
@ -30,12 +30,14 @@ public:
|
|||
virtual void Basic();
|
||||
virtual void Extended();
|
||||
virtual void Errors();
|
||||
virtual void PerlTests();
|
||||
|
||||
// The following functions are internal to the regexp tests.
|
||||
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
|
||||
virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
|
||||
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
|
||||
UErrorCode expectedStatus, int line);
|
||||
virtual UChar *ReadAndConvertFile(const char *fileName, int &len, UErrorCode &status);
|
||||
};
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
|
Loading…
Add table
Reference in a new issue