ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10157
This commit is contained in:
Andy Heninger 2002-11-06 02:35:20 +00:00
parent ee0d1cd5db
commit 96ec073b83
10 changed files with 329 additions and 120 deletions

View file

@ -1832,8 +1832,11 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
static const char * const
_uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_ERROR_START",
"U_REGEX_INTERNAL_ERROR"
"U_REGEX_INVALID_STATE"
"U_REGEX_INTERNAL_ERROR",
"U_REGEX_INVALID_STATE",
"U_REGEX_BAD_ESCAPE_SEQUENCE",
"U_REGEX_PROPERTY_SYNTAX",
"U_REGEX_UNIMPLEMENTED"
};
U_CAPI const char * U_EXPORT2

View file

@ -505,6 +505,7 @@ typedef enum UErrorCode {
U_REGEX_INVALID_STATE,
U_REGEX_BAD_ESCAPE_SEQUENCE,
U_REGEX_PROPERTY_SYNTAX,
U_REGEX_UNIMPLEMENTED,
U_REGEX_ERROR_LIMIT,
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View file

@ -449,26 +449,28 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doOpenAtomicParen:
// Open Paren.
// Open Atomic Paren.
error(U_REGEX_UNIMPLEMENTED);
break;
case doOpenLookAhead:
// Open Paren.
error(U_REGEX_UNIMPLEMENTED);
break;
case doOpenLookAheadNeg:
// Open Paren.
error(U_REGEX_UNIMPLEMENTED);
break;
case doOpenLookBehind:
// Open Paren.
error(U_REGEX_UNIMPLEMENTED);
break;
case doOpenLookBehindNeg:
// Open Paren.
break;
case doExprRParen:
error(U_REGEX_UNIMPLEMENTED);
break;
case doCloseParen:
@ -702,6 +704,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
break;
case doCaret: // TODO: multi-line mode flag.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
break;
case doDollar: // TODO: multi-line mode flag.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
break;
case doBackslashA:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
@ -751,8 +761,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
break;
case doBackslashx: // \x{abcd} alternate hex format
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
break;
case doBackslashZ:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
break;
case doBackslashz:
@ -782,6 +799,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
// Just scanned a \Q. Put character scanner into quote mode.
fQuoteMode = TRUE;
break;
case doBackRef:
// TODO: implement back references.
error(U_REGEX_UNIMPLEMENTED);
break;
case doNamedChar: // \N{NAMED_CHAR}
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
break;
default:
error(U_BRK_INTERNAL_ERROR);
@ -951,7 +978,7 @@ void RegexCompile::error(UErrorCode e) {
*fStatus = e;
fParseErr->line = fLineNum;
fParseErr->offset = fCharNum;
fParseErr->preContext[0] = 0;
fParseErr->preContext[0] = 0; // TODO: copy in some input pattern text
fParseErr->preContext[0] = 0;
}
}
@ -959,11 +986,6 @@ void RegexCompile::error(UErrorCode e) {
//
// Assorted Unicode character constants.
// Numeric because there is no portable way to enter them as literals.

View file

@ -22,7 +22,6 @@ U_NAMESPACE_BEGIN
enum Regex_PatternParseAction {
doExprOrOperator,
doCloseParen,
doProperty,
doTagValue,
@ -33,12 +32,14 @@ enum Regex_PatternParseAction {
doBackslashs,
doStartString,
doNGOpt,
doNamedChar,
doBackslashw,
doPossesiveStar,
doOpenLookBehind,
doExprRParen,
doBackslashx,
doBackslashz,
doStar,
doCaret,
doEnterQuoteMode,
doPossesivePlus,
doNGStar,
@ -57,9 +58,11 @@ enum Regex_PatternParseAction {
doOpt,
doOpenAtomicParen,
doBackslashS,
doNumberExpectedError,
doStringChar,
doOpenLookAhead,
doNumberExpectedError,
doBackRef,
doDollar,
doDotAny,
doBackslashW,
doBackslashX,
@ -94,80 +97,84 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 3, 2, FALSE} // 1 start
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
, {doStartString, 254, 11,0, TRUE} // 3 term
, {doStartString, 130, 11,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 18,0, TRUE} // 5
, {doNOP, 40 /* ( */, 25, 18, TRUE} // 6
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
, {doNOP, 253, 2,0, FALSE} // 9
, {doRuleError, 255, 76,0, FALSE} // 10
, {doStringChar, 254, 11,0, TRUE} // 11 string
, {doStringChar, 130, 11,0, TRUE} // 12
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
, {doSplitString, 43 /* + */, 18,0, FALSE} // 14
, {doSplitString, 42 /* * */, 18,0, FALSE} // 15
, {doSplitString, 123 /* { */, 18,0, FALSE} // 16
, {doEndString, 255, 18,0, FALSE} // 17
, {doNOP, 42 /* * */, 36,0, TRUE} // 18 expr-quant
, {doNOP, 43 /* + */, 39,0, TRUE} // 19
, {doNOP, 63 /* ? */, 42,0, TRUE} // 20
, {doNOP, 255, 22,0, FALSE} // 21
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 22 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 23
, {doNOP, 255, 3,0, FALSE} // 24
, {doNOP, 63 /* ? */, 27,0, TRUE} // 25 open-paren
, {doOpenCaptureParen, 255, 3, 18, FALSE} // 26
, {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE} // 27 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE} // 28
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
, {doBadOpenParenType, 255, 76,0, FALSE} // 32
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
, {doBadOpenParenType, 255, 76,0, FALSE} // 35
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
, {doStar, 255, 22,0, FALSE} // 38
, {doNGPlus, 63 /* ? */, 22,0, TRUE} // 39 quant-plus
, {doPossesivePlus, 43 /* + */, 22,0, TRUE} // 40
, {doPlus, 255, 22,0, FALSE} // 41
, {doNGOpt, 63 /* ? */, 22,0, TRUE} // 42 quant-opt
, {doPossesiveOpt, 43 /* + */, 22,0, TRUE} // 43
, {doOpt, 255, 22,0, FALSE} // 44
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
, {doNumberExpectedError, 255, 76,0, FALSE} // 47
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
, {doIntervalDigit, 128, 48,0, TRUE} // 50
, {doNumberExpectedError, 255, 76,0, FALSE} // 51
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
, {doNumberExpectedError, 255, 76,0, FALSE} // 54
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
, {doNOP, 255, 3,0, FALSE} // 58
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 60
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 61
, {doBackslashd, 100 /* d */, 18,0, TRUE} // 62
, {doBackslashD, 68 /* D */, 18,0, TRUE} // 63
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 64
, {doProperty, 112 /* p */, 18,0, FALSE} // 65
, {doProperty, 80 /* P */, 18,0, FALSE} // 66
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 67
, {doBackslashS, 83 /* S */, 18,0, TRUE} // 68
, {doBackslashs, 115 /* s */, 18,0, TRUE} // 69
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 70
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 71
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 72
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 73
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 74
, {doStartString, 255, 11,0, TRUE} // 75
, {doExit, 255, 76,0, TRUE} // 76 errorDeath
, {doStartString, 254, 13,0, TRUE} // 3 term
, {doStartString, 130, 13,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
, {doNOP, 40 /* ( */, 27, 20, TRUE} // 6
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
, {doNOP, 92 /* \ */, 60,0, TRUE} // 10
, {doNOP, 253, 2,0, FALSE} // 11
, {doRuleError, 255, 80,0, FALSE} // 12
, {doStringChar, 254, 13,0, TRUE} // 13 string
, {doStringChar, 130, 13,0, TRUE} // 14
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
, {doSplitString, 43 /* + */, 20,0, FALSE} // 16
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
, {doEndString, 255, 20,0, FALSE} // 19
, {doNOP, 42 /* * */, 41,0, TRUE} // 20 expr-quant
, {doNOP, 43 /* + */, 44,0, TRUE} // 21
, {doNOP, 63 /* ? */, 47,0, TRUE} // 22
, {doNOP, 255, 24,0, FALSE} // 23
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 24 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 25
, {doNOP, 255, 3,0, FALSE} // 26
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 28
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 29 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 3, 24, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE} // 32
, {doNOP, 60 /* < */, 36,0, TRUE} // 33
, {doNOP, 35 /* # */, 39,0, TRUE} // 34
, {doBadOpenParenType, 255, 80,0, FALSE} // 35
, {doOpenLookBehind, 61 /* = */, 3, 24, TRUE} // 36 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE} // 37
, {doBadOpenParenType, 255, 80,0, FALSE} // 38
, {doNOP, 41 /* ) */, 3,0, TRUE} // 39 paren-comment
, {doNOP, 255, 39,0, TRUE} // 40
, {doNGStar, 63 /* ? */, 24,0, TRUE} // 41 quant-star
, {doPossesiveStar, 43 /* + */, 24,0, TRUE} // 42
, {doStar, 255, 24,0, FALSE} // 43
, {doNGPlus, 63 /* ? */, 24,0, TRUE} // 44 quant-plus
, {doPossesivePlus, 43 /* + */, 24,0, TRUE} // 45
, {doPlus, 255, 24,0, FALSE} // 46
, {doNGOpt, 63 /* ? */, 24,0, TRUE} // 47 quant-opt
, {doPossesiveOpt, 43 /* + */, 24,0, TRUE} // 48
, {doOpt, 255, 24,0, FALSE} // 49
, {doNOP, 129, 50,0, TRUE} // 50 interval-open
, {doIntervalMinValue, 128, 53,0, FALSE} // 51
, {doNumberExpectedError, 255, 80,0, FALSE} // 52
, {doNOP, 129, 57,0, TRUE} // 53 interval-value
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
, {doIntervalDigit, 128, 53,0, TRUE} // 55
, {doNumberExpectedError, 255, 80,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-close
, {doTagValue, 125 /* } */, 24,0, TRUE} // 58
, {doNumberExpectedError, 255, 80,0, FALSE} // 59
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 60 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 61
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 62
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 63
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 64
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 65
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 66
, {doProperty, 112 /* p */, 20,0, FALSE} // 67
, {doProperty, 80 /* P */, 20,0, FALSE} // 68
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 69
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 70
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 71
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 72
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 73
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 74
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 75
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 76
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 77
, {doBackRef, 128, 20,0, TRUE} // 78
, {doStartString, 255, 13,0, TRUE} // 79
, {doExit, 255, 80,0, TRUE} // 80 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -179,6 +186,8 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"string",
0,
@ -201,9 +210,12 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"open-paren-lookbehind",
0,
0,
"paren-comment",
0,
"quant-star",
0,
@ -223,10 +235,6 @@ static const char *RegexStateNames[] = { 0,
0,
"interval-close",
0,
0,
"expr-cont-no-interval",
0,
0,
0,
"backslash",
0,
@ -244,6 +252,9 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -77,6 +77,8 @@ term:
'[' n expr-quant doScanUnicodeSet
'(' n open-paren ^expr-quant
'.' n expr-quant doDotAny
'^' n term doCaret
'$' n term doDollar
'\' n backslash
eof finish
default errorDeath doRuleError
@ -133,6 +135,7 @@ open-paren-extended:
'=' n term ^expr-cont doOpenLookAhead # (?=
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
'<' n open-paren-lookbehind
'#' n paren-comment
default errorDeath doBadOpenParenType
open-paren-lookbehind:
@ -141,6 +144,15 @@ open-paren-lookbehind:
default errorDeath doBadOpenParenType
#
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
# TODO: should parens nest here? Check what perl does.
#
paren-comment:
')' n term
default n paren-comment
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
# between plain '*', '*?', '*+'
@ -188,23 +200,11 @@ interval-value:
interval-close:
white_space n interval-close
'}' n expr-cont-no-interval doTagValue
'}' n expr-cont doTagValue
default errorDeath doNumberExpectedError
#
# expr-cont-no-tag Expression, continuation. At a point where additional terms are
# allowed, but not required. Just like
# expr-cont, above, except that no interval
# specification {min, max} is permitted.
#
expr-cont-no-interval:
quoted term
'|' n term doExprOrOperator
')' n pop doExprRParen
default term
#
# backslash # Backslash. Figure out which of the \thingies we have encountered.
@ -217,6 +217,7 @@ backslash:
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'N' n expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'Q' n term doEnterQuoteMode
@ -225,8 +226,10 @@ backslash:
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
'x' n expr-quant doBackslashx
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef
default n string doStartString

View file

@ -35,13 +35,15 @@ static const uint32_t URX_FAIL = 14; // Stop match operation; No
static const uint32_t URX_BACKSLASH_A = 15;
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
static const uint32_t URX_BACKSLASH_G = 17;
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
static const uint32_t URX_BACKSLASH_X = 19;
static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z
static const uint32_t URX_BACKSLASH_Z = 20; // \z Unconditional end of line.
static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any mode.
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
static const uint32_t URX_CARET = 23; // Value field: 1: multi-line mode.
static const uint32_t URX_DOLLAR = 24; // Also for \Z
//

View file

@ -560,9 +560,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// Cache frequently referenced items from the compiled pattern
// in local variables.
//
UVector *pat = fPattern->fCompiledPat;
const UnicodeString *litText = &fPattern->fLiteralText;
UVector *sets = fPattern->fSets;
UVector *pat = fPattern->fCompiledPat;
const UnicodeString *litText = &fPattern->fLiteralText;
UVector *sets = fPattern->fSets;
int32_t inputLen = fInput->length();
//
@ -658,6 +659,46 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
fCaptureEnds->setElementAt(inputIdx, opValue);
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input
if (inputIdx < inputLen-2) {
// We are no where near the end of input. Fail.
backTrack(inputIdx, patIdx);
break;
}
if (inputIdx >= inputLen) {
// We really are at the end of input. Success.
break;
}
// If we are positioned just before a new-line that is located at the
// end of input, succeed.
if (inputIdx == inputLen-1) {
UChar32 c = fInput->char32At(inputIdx);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
break; // At new-line at end of input. Success
}
}
if (inputIdx == inputLen-2) {
if (fInput->char32At(inputIdx) == 0x0d && fInput->char32At(inputIdx+1) == 0x0a) {
break; // At CR/LF at end of input. Success
}
}
backTrack(inputIdx, patIdx);
// TODO: support for multi-line mode.
break;
case URX_CARET: // ^, test for start of line
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
} // TODO: support for multi-line mode.
break;
case URX_BACKSLASH_A: // Test for start of input
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
@ -731,18 +772,24 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
for(;;) {
c = fInput->char32At(inputIdx);
ctype = u_charType(c);
// TODO: make a set and add the "othe grapheme extend" chars
// TODO: make a set and add the "other grapheme extend" chars
// to the list of stuff to be skipped over.
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
break;
}
inputIdx = fInput->moveIndex32(inputIdx, 1);
if (inputIdx >= fInputLength) {
break;
}
}
}
}
break;
case URX_BACKSLASH_Z: // Test for end of line
if (FALSE) {
if (inputIdx < inputLen) {
backTrack(inputIdx, patIdx);
}
break;

View file

@ -421,7 +421,11 @@ static char *opNames[] = {
"URX_BACKSLASH_G",
"URX_BACKSLASH_W",
"URX_BACKSLASH_X",
"URX_BACKSLASH_Z"
"URX_BACKSLASH_Z",
"URX_DOTANY_ALL",
"URX_BACKSLASH_D",
"URX_CARET",
"URX_DOLLAR"
};
void RegexPattern::dump() {
@ -470,8 +474,11 @@ void RegexPattern::dump() {
case URX_STATE_SAVE:
case URX_JMP:
case URX_BACKSLASH_B:
case URX_BACKSLASH_D:
case URX_BACKSLASH_W:
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
// types with an integer operand field.
printf("%d", val);
break;

View file

@ -54,6 +54,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 4: name = "Extended";
if (exec) Extended();
break;
case 5: name = "Errors";
if (exec) Errors();
break;
default: name = "";
break; //needed to end loop
}
@ -163,7 +167,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
//
// usage:
// REGEX_FIND("pattern", "input text");
// REGEX_FIND_S("pattern", "input text", expected status);
// REGEX_ERR("pattern", expected status);
//
// The input text is unescaped. The pattern is not.
// The input text is marked with the expected match positions
@ -177,7 +181,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
// REGEX_FIND is invoked via a macro, which allows capturing the source file line
// number for use in error messages.
#define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__);
#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__);
// Set a value into a UVector at position specified by a decimal number in
@ -301,6 +304,52 @@ cleanupAndReturn:
}
//---------------------------------------------------------------------------
//
// REGEX_ERR Macro + invocation function to simplify writing
// regex tests for incorrect patterns
//
// usage:
// REGEX_ERR("pattern", expected error line, column, expected status);
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
//
// regex_err   Compile a pattern and check the resulting status (and, for a
//             failure, the reported error position) against expectations.
//
//    pat             the pattern source, passed to RegexPattern::compile()
//    errLine         expected UParseError.line; only checked when the
//                    resulting status is not U_ZERO_ERROR
//    errCol          expected UParseError.offset; checked under the same condition
//    expectedStatus  the UErrorCode that compiling the pattern should produce
//    line            source line of the invoking test, used in failure messages
//                    (REGEX_ERR supplies __LINE__)
//
void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
                          UErrorCode expectedStatus, int line) {
    UErrorCode     status = U_ZERO_ERROR;
    UParseError    pe;

    //
    //  Compile the caller's pattern
    //
    UnicodeString  patString(pat);
    RegexPattern  *callerPattern = RegexPattern::compile(patString, 0, pe, status);

    if (status != expectedStatus) {
        errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    } else if (status != U_ZERO_ERROR &&
               (pe.line != errLine || pe.offset != errCol)) {
        // The parse-error position is only examined when the status is not
        // U_ZERO_ERROR; pe is not read otherwise.
        errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
              line, errLine, errCol, pe.line, pe.offset);
    }

    delete callerPattern;
}
//---------------------------------------------------------------------------
//
// Basic Check for basic functionality of regex pattern matching.
@ -429,8 +478,8 @@ void RegexTest::Basic() {
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
// REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape
// REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape
REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
@ -1087,7 +1136,66 @@ void RegexTest::Extended() {
// \X consume one combining char sequence.
REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
"<0><1>A</1><2>B</2><3> </3></0>");
"<0><1>A</1><2>B</2><3> </3><4>\\r\\n</4></0>");
REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
"<0><1>A\\u0301</1><2>\n</2><3>\\u0305</3><4>a\\u0302\\u0303\\u0304</4></0>");
// ^ matches only at beginning of line
REGEX_FIND(".*^(Hello)", "<0><1>Hello</1></0> Hello Hello Hello Goodbye");
REGEX_FIND(".*(Hello)", "<0>Hello Hello Hello <1>Hello</1></0> Goodbye");
REGEX_FIND(".*^(Hello)", " Hello Hello Hello Hello Goodbye"); // No Match
// $ matches only at end of line, or before a newline preceding the end of line
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye> Goodbye Goodbye "); // No Match
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye Goodbye Goodbye\\n\\n"); // No Match
// \Z matches at end of input, like $ with default flags.
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye> Goodbye Goodbye "); // No Match
REGEX_FIND("here$", "here\\nthe end"); // No Match
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye Goodbye Goodbye\\n\\n"); // No Match
// \z matches only at the end of string.
// no special treatment of new lines.
// no dependencies on flag settings.
REGEX_FIND(".*?(Goodbye)\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye "); // No Match
REGEX_FIND("here$", "here\\nthe end"); // No Match
REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye\\n"); // No Match
REGEX_FIND(".*?(Goodbye)\\n\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1>\\n</0>");
// (?# comment) doesn't muck up pattern
REGEX_FIND("Hello (?# this is a comment) world", " <0>Hello world</0>...");
}
//---------------------------------------------------------------------------
//
// Errors Check for error handling in patterns.
//
//---------------------------------------------------------------------------
void RegexTest::Errors() {
// Each REGEX_ERR invocation supplies: the pattern, the expected error line,
// the expected error offset, and the expected UErrorCode. The macro forwards
// these to regex_err() together with __LINE__ of the invocation.

// \escape sequences that aren't implemented yet.
REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
// Missing close parentheses
// NOTE(review): this case is commented out; presumably an unterminated (?# comment
// is not yet reported as expected - confirm before enabling.
//REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
}

View file

@ -21,13 +21,18 @@ public:
virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL );
// The following are test functions that are visible from the intltest test framework.
virtual void API_Match();
virtual void API_Pattern();
virtual void API_Replace();
virtual void Basic();
virtual void Extended();
virtual void Errors();
// The following functions are internal to the regexp tests.
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
UErrorCode expectedStatus, int line);
};
#endif