mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10157
This commit is contained in:
parent
ee0d1cd5db
commit
96ec073b83
10 changed files with 329 additions and 120 deletions
|
@ -1832,8 +1832,11 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
|
|||
static const char * const
|
||||
_uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
||||
"U_REGEX_ERROR_START",
|
||||
"U_REGEX_INTERNAL_ERROR"
|
||||
"U_REGEX_INVALID_STATE"
|
||||
"U_REGEX_INTERNAL_ERROR",
|
||||
"U_REGEX_INVALID_STATE",
|
||||
"U_REGEX_BAD_ESCAPE_SEQUENCE",
|
||||
"U_REGEX_PROPERTY_SYNTAX",
|
||||
"U_REGEX_UNIMPLEMENTED"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
|
|
@ -505,6 +505,7 @@ typedef enum UErrorCode {
|
|||
U_REGEX_INVALID_STATE,
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE,
|
||||
U_REGEX_PROPERTY_SYNTAX,
|
||||
U_REGEX_UNIMPLEMENTED,
|
||||
U_REGEX_ERROR_LIMIT,
|
||||
|
||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
|
|
@ -449,26 +449,28 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
|
||||
case doOpenAtomicParen:
|
||||
// Open Paren.
|
||||
// Open Atomic Paren.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doOpenLookAhead:
|
||||
// Open Look-Ahead Paren.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doOpenLookAheadNeg:
|
||||
// Open Negative Look-Ahead Paren.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doOpenLookBehind:
|
||||
// Open Look-Behind Paren.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doOpenLookBehindNeg:
|
||||
// Open Negative Look-Behind Paren.
|
||||
break;
|
||||
|
||||
case doExprRParen:
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doCloseParen:
|
||||
|
@ -702,6 +704,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doCaret: // TODO: multi-line mode flag.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
|
||||
break;
|
||||
|
||||
|
||||
case doDollar: // TODO: multi-line mode flag.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashA:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
|
||||
|
@ -751,8 +761,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashx: // \x{abcd} alternate hex format
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case doBackslashZ:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashz:
|
||||
|
@ -782,6 +799,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// Just scanned a \Q. Put character scanner into quote mode.
|
||||
fQuoteMode = TRUE;
|
||||
break;
|
||||
|
||||
case doBackRef:
|
||||
// TODO: implement back references.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doNamedChar: // \N{NAMED_CHAR}
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
default:
|
||||
error(U_BRK_INTERNAL_ERROR);
|
||||
|
@ -951,7 +978,7 @@ void RegexCompile::error(UErrorCode e) {
|
|||
*fStatus = e;
|
||||
fParseErr->line = fLineNum;
|
||||
fParseErr->offset = fCharNum;
|
||||
fParseErr->preContext[0] = 0;
|
||||
fParseErr->preContext[0] = 0; // TODO: copy in some input pattern text
|
||||
fParseErr->preContext[0] = 0;
|
||||
}
|
||||
}
|
||||
|
@ -959,11 +986,6 @@ void RegexCompile::error(UErrorCode e) {
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Assorted Unicode character constants.
|
||||
// Numeric because there is no portable way to enter them as literals.
|
||||
|
|
|
@ -22,7 +22,6 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
|
||||
enum Regex_PatternParseAction {
|
||||
doExprOrOperator,
|
||||
doCloseParen,
|
||||
doProperty,
|
||||
doTagValue,
|
||||
|
@ -33,12 +32,14 @@ enum Regex_PatternParseAction {
|
|||
doBackslashs,
|
||||
doStartString,
|
||||
doNGOpt,
|
||||
doNamedChar,
|
||||
doBackslashw,
|
||||
doPossesiveStar,
|
||||
doOpenLookBehind,
|
||||
doExprRParen,
|
||||
doBackslashx,
|
||||
doBackslashz,
|
||||
doStar,
|
||||
doCaret,
|
||||
doEnterQuoteMode,
|
||||
doPossesivePlus,
|
||||
doNGStar,
|
||||
|
@ -57,9 +58,11 @@ enum Regex_PatternParseAction {
|
|||
doOpt,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doNumberExpectedError,
|
||||
doStringChar,
|
||||
doOpenLookAhead,
|
||||
doNumberExpectedError,
|
||||
doBackRef,
|
||||
doDollar,
|
||||
doDotAny,
|
||||
doBackslashW,
|
||||
doBackslashX,
|
||||
|
@ -94,80 +97,84 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 3, 2, FALSE} // 1 start
|
||||
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
|
||||
, {doStartString, 254, 11,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 11,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 18,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 25, 18, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
|
||||
, {doNOP, 253, 2,0, FALSE} // 9
|
||||
, {doRuleError, 255, 76,0, FALSE} // 10
|
||||
, {doStringChar, 254, 11,0, TRUE} // 11 string
|
||||
, {doStringChar, 130, 11,0, TRUE} // 12
|
||||
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
|
||||
, {doSplitString, 43 /* + */, 18,0, FALSE} // 14
|
||||
, {doSplitString, 42 /* * */, 18,0, FALSE} // 15
|
||||
, {doSplitString, 123 /* { */, 18,0, FALSE} // 16
|
||||
, {doEndString, 255, 18,0, FALSE} // 17
|
||||
, {doNOP, 42 /* * */, 36,0, TRUE} // 18 expr-quant
|
||||
, {doNOP, 43 /* + */, 39,0, TRUE} // 19
|
||||
, {doNOP, 63 /* ? */, 42,0, TRUE} // 20
|
||||
, {doNOP, 255, 22,0, FALSE} // 21
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 22 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 23
|
||||
, {doNOP, 255, 3,0, FALSE} // 24
|
||||
, {doNOP, 63 /* ? */, 27,0, TRUE} // 25 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 18, FALSE} // 26
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE} // 27 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE} // 28
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
|
||||
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
|
||||
, {doBadOpenParenType, 255, 76,0, FALSE} // 32
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 76,0, FALSE} // 35
|
||||
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
|
||||
, {doStar, 255, 22,0, FALSE} // 38
|
||||
, {doNGPlus, 63 /* ? */, 22,0, TRUE} // 39 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 22,0, TRUE} // 40
|
||||
, {doPlus, 255, 22,0, FALSE} // 41
|
||||
, {doNGOpt, 63 /* ? */, 22,0, TRUE} // 42 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 22,0, TRUE} // 43
|
||||
, {doOpt, 255, 22,0, FALSE} // 44
|
||||
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
|
||||
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 47
|
||||
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
|
||||
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
|
||||
, {doIntervalDigit, 128, 48,0, TRUE} // 50
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 51
|
||||
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
|
||||
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 54
|
||||
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
|
||||
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
|
||||
, {doNOP, 255, 3,0, FALSE} // 58
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 60
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 61
|
||||
, {doBackslashd, 100 /* d */, 18,0, TRUE} // 62
|
||||
, {doBackslashD, 68 /* D */, 18,0, TRUE} // 63
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 64
|
||||
, {doProperty, 112 /* p */, 18,0, FALSE} // 65
|
||||
, {doProperty, 80 /* P */, 18,0, FALSE} // 66
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 67
|
||||
, {doBackslashS, 83 /* S */, 18,0, TRUE} // 68
|
||||
, {doBackslashs, 115 /* s */, 18,0, TRUE} // 69
|
||||
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 70
|
||||
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 71
|
||||
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 72
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 73
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 74
|
||||
, {doStartString, 255, 11,0, TRUE} // 75
|
||||
, {doExit, 255, 76,0, TRUE} // 76 errorDeath
|
||||
, {doStartString, 254, 13,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 13,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 27, 20, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
|
||||
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
|
||||
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 60,0, TRUE} // 10
|
||||
, {doNOP, 253, 2,0, FALSE} // 11
|
||||
, {doRuleError, 255, 80,0, FALSE} // 12
|
||||
, {doStringChar, 254, 13,0, TRUE} // 13 string
|
||||
, {doStringChar, 130, 13,0, TRUE} // 14
|
||||
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
|
||||
, {doSplitString, 43 /* + */, 20,0, FALSE} // 16
|
||||
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
|
||||
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
|
||||
, {doEndString, 255, 20,0, FALSE} // 19
|
||||
, {doNOP, 42 /* * */, 41,0, TRUE} // 20 expr-quant
|
||||
, {doNOP, 43 /* + */, 44,0, TRUE} // 21
|
||||
, {doNOP, 63 /* ? */, 47,0, TRUE} // 22
|
||||
, {doNOP, 255, 24,0, FALSE} // 23
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 24 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 25
|
||||
, {doNOP, 255, 3,0, FALSE} // 26
|
||||
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 28
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 29 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 30
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 24, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 36,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 39,0, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 80,0, FALSE} // 35
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 24, TRUE} // 36 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE} // 37
|
||||
, {doBadOpenParenType, 255, 80,0, FALSE} // 38
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 39 paren-comment
|
||||
, {doNOP, 255, 39,0, TRUE} // 40
|
||||
, {doNGStar, 63 /* ? */, 24,0, TRUE} // 41 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 24,0, TRUE} // 42
|
||||
, {doStar, 255, 24,0, FALSE} // 43
|
||||
, {doNGPlus, 63 /* ? */, 24,0, TRUE} // 44 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 24,0, TRUE} // 45
|
||||
, {doPlus, 255, 24,0, FALSE} // 46
|
||||
, {doNGOpt, 63 /* ? */, 24,0, TRUE} // 47 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 24,0, TRUE} // 48
|
||||
, {doOpt, 255, 24,0, FALSE} // 49
|
||||
, {doNOP, 129, 50,0, TRUE} // 50 interval-open
|
||||
, {doIntervalMinValue, 128, 53,0, FALSE} // 51
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 52
|
||||
, {doNOP, 129, 57,0, TRUE} // 53 interval-value
|
||||
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
|
||||
, {doIntervalDigit, 128, 53,0, TRUE} // 55
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-close
|
||||
, {doTagValue, 125 /* } */, 24,0, TRUE} // 58
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 59
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 60 backslash
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 61
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 62
|
||||
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 63
|
||||
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 64
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 65
|
||||
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 66
|
||||
, {doProperty, 112 /* p */, 20,0, FALSE} // 67
|
||||
, {doProperty, 80 /* P */, 20,0, FALSE} // 68
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 69
|
||||
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 70
|
||||
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 71
|
||||
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 72
|
||||
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 73
|
||||
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 74
|
||||
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 75
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 76
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 77
|
||||
, {doBackRef, 128, 20,0, TRUE} // 78
|
||||
, {doStartString, 255, 13,0, TRUE} // 79
|
||||
, {doExit, 255, 80,0, TRUE} // 80 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -179,6 +186,8 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"string",
|
||||
0,
|
||||
|
@ -201,9 +210,12 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
0,
|
||||
"paren-comment",
|
||||
0,
|
||||
"quant-star",
|
||||
0,
|
||||
|
@ -223,10 +235,6 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
"interval-close",
|
||||
0,
|
||||
0,
|
||||
"expr-cont-no-interval",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"backslash",
|
||||
0,
|
||||
|
@ -244,6 +252,9 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
|
|
@ -77,6 +77,8 @@ term:
|
|||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren ^expr-quant
|
||||
'.' n expr-quant doDotAny
|
||||
'^' n term doCaret
|
||||
'$' n term doDollar
|
||||
'\' n backslash
|
||||
eof finish
|
||||
default errorDeath doRuleError
|
||||
|
@ -133,6 +135,7 @@ open-paren-extended:
|
|||
'=' n term ^expr-cont doOpenLookAhead # (?=
|
||||
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
|
||||
'<' n open-paren-lookbehind
|
||||
'#' n paren-comment
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
open-paren-lookbehind:
|
||||
|
@ -141,6 +144,15 @@ open-paren-lookbehind:
|
|||
default errorDeath doBadOpenParenType
|
||||
|
||||
|
||||
#
|
||||
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
|
||||
# TODO: should parens nest here? Check what perl does.
|
||||
#
|
||||
paren-comment:
|
||||
')' n term
|
||||
default n paren-comment
|
||||
|
||||
|
||||
#
|
||||
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
|
||||
# between plain '*', '*?', '*+'
|
||||
|
@ -188,23 +200,11 @@ interval-value:
|
|||
|
||||
interval-close:
|
||||
white_space n interval-close
|
||||
'}' n expr-cont-no-interval doTagValue
|
||||
'}' n expr-cont doTagValue
|
||||
default errorDeath doNumberExpectedError
|
||||
|
||||
|
||||
|
||||
#
|
||||
# expr-cont-no-tag Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required. Just like
|
||||
# expr-cont, above, except that no interval
|
||||
# specification {min, max} is permitted.
|
||||
#
|
||||
expr-cont-no-interval:
|
||||
quoted term
|
||||
'|' n term doExprOrOperator
|
||||
')' n pop doExprRParen
|
||||
default term
|
||||
|
||||
|
||||
#
|
||||
# backslash # Backslash. Figure out which of the \thingies we have encountered.
|
||||
|
@ -217,6 +217,7 @@ backslash:
|
|||
'd' n expr-quant doBackslashd
|
||||
'D' n expr-quant doBackslashD
|
||||
'G' n term doBackslashG
|
||||
'N' n expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'Q' n term doEnterQuoteMode
|
||||
|
@ -225,8 +226,10 @@ backslash:
|
|||
'W' n expr-quant doBackslashW
|
||||
'w' n expr-quant doBackslashw
|
||||
'X' n expr-quant doBackslashX
|
||||
'x' n expr-quant doBackslashx
|
||||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
digit_char n expr-quant doBackRef
|
||||
|
||||
default n string doStartString
|
||||
|
||||
|
|
|
@ -35,13 +35,15 @@ static const uint32_t URX_FAIL = 14; // Stop match operation; No
|
|||
|
||||
static const uint32_t URX_BACKSLASH_A = 15;
|
||||
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
|
||||
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
|
||||
static const uint32_t URX_BACKSLASH_G = 17;
|
||||
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
|
||||
static const uint32_t URX_BACKSLASH_X = 19;
|
||||
static const uint32_t URX_BACKSLASH_Z = 20; // Value field: 0: \z 1: \Z
|
||||
static const uint32_t URX_BACKSLASH_Z = 20; // \z Unconditional end of line.
|
||||
|
||||
static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any mode.
|
||||
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
|
||||
static const uint32_t URX_CARET = 23; // Value field: 1: multi-line mode.
|
||||
static const uint32_t URX_DOLLAR = 24; // Also for \Z
|
||||
|
||||
|
||||
//
|
||||
|
|
|
@ -560,9 +560,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
// Cache frequently referenced items from the compiled pattern
|
||||
// in local variables.
|
||||
//
|
||||
UVector *pat = fPattern->fCompiledPat;
|
||||
const UnicodeString *litText = &fPattern->fLiteralText;
|
||||
UVector *sets = fPattern->fSets;
|
||||
UVector *pat = fPattern->fCompiledPat;
|
||||
const UnicodeString *litText = &fPattern->fLiteralText;
|
||||
UVector *sets = fPattern->fSets;
|
||||
int32_t inputLen = fInput->length();
|
||||
|
||||
|
||||
//
|
||||
|
@ -658,6 +659,46 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
fCaptureEnds->setElementAt(inputIdx, opValue);
|
||||
break;
|
||||
|
||||
|
||||
case URX_DOLLAR: // $, test for End of line
|
||||
// or for position before new line at end of input
|
||||
if (inputIdx < inputLen-2) {
|
||||
// We are nowhere near the end of input. Fail.
|
||||
backTrack(inputIdx, patIdx);
|
||||
break;
|
||||
}
|
||||
if (inputIdx >= inputLen) {
|
||||
// We really are at the end of input. Success.
|
||||
break;
|
||||
}
|
||||
// If we are positioned just before a new-line that is located at the
|
||||
// end of input, succeed.
|
||||
if (inputIdx == inputLen-1) {
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
break; // At new-line at end of input. Success
|
||||
}
|
||||
}
|
||||
|
||||
if (inputIdx == inputLen-2) {
|
||||
if (fInput->char32At(inputIdx) == 0x0d && fInput->char32At(inputIdx+1) == 0x0a) {
|
||||
break; // At CR/LF at end of input. Success
|
||||
}
|
||||
}
|
||||
|
||||
backTrack(inputIdx, patIdx);
|
||||
|
||||
// TODO: support for multi-line mode.
|
||||
break;
|
||||
|
||||
|
||||
case URX_CARET: // ^, test for start of line
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
} // TODO: support for multi-line mode.
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_A: // Test for start of input
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
|
@ -731,18 +772,24 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
for(;;) {
|
||||
c = fInput->char32At(inputIdx);
|
||||
ctype = u_charType(c);
|
||||
// TODO: make a set and add the "othe grapheme extend" chars
|
||||
// TODO: make a set and add the "other grapheme extend" chars
|
||||
// to the list of stuff to be skipped over.
|
||||
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
|
||||
break;
|
||||
}
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
if (inputIdx >= fInputLength) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_BACKSLASH_Z: // Test for end of line
|
||||
if (FALSE) {
|
||||
if (inputIdx < inputLen) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -421,7 +421,11 @@ static char *opNames[] = {
|
|||
"URX_BACKSLASH_G",
|
||||
"URX_BACKSLASH_W",
|
||||
"URX_BACKSLASH_X",
|
||||
"URX_BACKSLASH_Z"
|
||||
"URX_BACKSLASH_Z",
|
||||
"URX_DOTANY_ALL",
|
||||
"URX_BACKSLASH_D",
|
||||
"URX_CARET",
|
||||
"URX_DOLLAR"
|
||||
};
|
||||
|
||||
void RegexPattern::dump() {
|
||||
|
@ -470,8 +474,11 @@ void RegexPattern::dump() {
|
|||
case URX_STATE_SAVE:
|
||||
case URX_JMP:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
// types with an integer operand field.
|
||||
printf("%d", val);
|
||||
break;
|
||||
|
|
|
@ -54,6 +54,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
case 4: name = "Extended";
|
||||
if (exec) Extended();
|
||||
break;
|
||||
case 5: name = "Errors";
|
||||
if (exec) Errors();
|
||||
break;
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
|
@ -163,7 +167,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
//
|
||||
// usage:
|
||||
// REGEX_FIND("pattern", "input text");
|
||||
// REGEX_FIND_S("pattern", "input text", expected status);
|
||||
// REGEX_ERR("pattern", expected error line, column, expected status);
|
||||
//
|
||||
// The input text is unescaped. The pattern is not.
|
||||
// The input text is marked with the expected match positions
|
||||
|
@ -177,7 +181,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
// REGEX_FIND is invoked via a macro, which allows capturing the source file line
|
||||
// number for use in error messages.
|
||||
#define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__);
|
||||
#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__);
|
||||
|
||||
|
||||
// Set a value into a UVector at position specified by a decimal number in
|
||||
|
@ -301,6 +304,52 @@ cleanupAndReturn:
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// REGEX_ERR Macro + invocation function to simplify writing tests
|
||||
// for incorrect regex patterns
|
||||
//
|
||||
// usage:
|
||||
// REGEX_ERR("pattern", expected error line, column, expected status);
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
|
||||
|
||||
void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
|
||||
UErrorCode expectedStatus, int line) {
|
||||
// Compiles `pat` and reports a test failure through errln() when either
//   - the status produced by the compile differs from expectedStatus, or
//   - the status matches, is not U_ZERO_ERROR, and the UParseError
//     line/offset differ from errLine/errCol.
// `line` is only used to label the failure messages.
//
// NOTE(review): `pattern` is not referenced again in this function;
// patString (below) is the string actually handed to compile().
UnicodeString pattern(pat);
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// NOTE(review): pe is not initialized here; presumably compile() fills it
// in when it reports an error -- confirm.
UParseError pe;
|
||||
RegexPattern *callerPattern = NULL;
|
||||
|
||||
//
|
||||
// Compile the caller's pattern
|
||||
//
|
||||
UnicodeString patString(pat);
|
||||
callerPattern = RegexPattern::compile(patString, 0, pe, status);
|
||||
// The compile status is not the one the caller asked for.
if (status != expectedStatus) {
|
||||
errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
|
||||
} else {
|
||||
// The error position is only checked when the (expected) status is
// something other than U_ZERO_ERROR.
if (status != U_ZERO_ERROR) {
|
||||
if (pe.line != errLine || pe.offset != errCol) {
|
||||
errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
|
||||
line, errLine, errCol, pe.line, pe.offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Delete the pointer returned by compile() on every path.
delete callerPattern;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Basic Check for basic functionality of regex pattern matching.
|
||||
|
@ -429,8 +478,8 @@ void RegexTest::Basic() {
|
|||
|
||||
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
|
||||
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
|
||||
// REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape
|
||||
// REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape
|
||||
REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
|
||||
REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
|
||||
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
|
||||
REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
|
||||
REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
|
||||
|
@ -1087,7 +1136,66 @@ void RegexTest::Extended() {
|
|||
|
||||
// \X consume one combining char sequence.
|
||||
REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
|
||||
"<0><1>A</1><2>B</2><3> </3></0>");
|
||||
"<0><1>A</1><2>B</2><3> </3><4>\\r\\n</4></0>");
|
||||
REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
|
||||
"<0><1>A\\u0301</1><2>\n</2><3>\\u0305</3><4>a\\u0302\\u0303\\u0304</4></0>");
|
||||
|
||||
// ^ matches only at beginning of line
|
||||
REGEX_FIND(".*^(Hello)", "<0><1>Hello</1></0> Hello Hello Hello Goodbye");
|
||||
REGEX_FIND(".*(Hello)", "<0>Hello Hello Hello <1>Hello</1></0> Goodbye");
|
||||
REGEX_FIND(".*^(Hello)", " Hello Hello Hello Hello Goodbye"); // No Match
|
||||
|
||||
// $ matches only at end of line, or before a newline preceding the end of line
|
||||
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
|
||||
REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
|
||||
REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye> Goodbye Goodbye "); // No Match
|
||||
|
||||
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
|
||||
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
|
||||
REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
|
||||
REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye Goodbye Goodbye\\n\\n"); // No Match
|
||||
|
||||
// \Z matches at end of input, like $ with default flags.
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
|
||||
REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye> Goodbye Goodbye "); // No Match
|
||||
REGEX_FIND("here$", "here\\nthe end"); // No Match
|
||||
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
|
||||
REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye Goodbye Goodbye\\n\\n"); // No Match
|
||||
|
||||
// \z matches only at the end of string.
|
||||
// no special treatment of new lines.
|
||||
// no dependencies on flag settings.
|
||||
REGEX_FIND(".*?(Goodbye)\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
|
||||
REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye "); // No Match
|
||||
REGEX_FIND("here$", "here\\nthe end"); // No Match
|
||||
|
||||
REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye\\n"); // No Match
|
||||
REGEX_FIND(".*?(Goodbye)\\n\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1>\\n</0>");
|
||||
|
||||
// (?# comment) doesn't muck up pattern
|
||||
REGEX_FIND("Hello (?# this is a comment) world", " <0>Hello world</0>...");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Errors Check for error handling in patterns.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
void RegexTest::Errors() {
|
||||
// Each REGEX_ERR invocation passes: the pattern, the expected error line,
// the expected error column, and the expected UErrorCode.
//
// \escape sequences that aren't implemented yet.
|
||||
REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Missing close parentheses
|
||||
// NOTE(review): the unterminated "(?#" comment case below is commented out,
// so it is not exercised by this test.
//REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -21,13 +21,18 @@ public:
|
|||
|
||||
virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
|
||||
// The following are test functions that are visible from the intltest test framework.
|
||||
virtual void API_Match();
|
||||
virtual void API_Pattern();
|
||||
virtual void API_Replace();
|
||||
virtual void Basic();
|
||||
virtual void Extended();
|
||||
virtual void Errors();
|
||||
|
||||
// The following functions are internal to the regexp tests.
|
||||
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
|
||||
virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
|
||||
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
|
||||
UErrorCode expectedStatus, int line);
|
||||
};
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue