mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-09 15:27:38 +00:00
ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10135
This commit is contained in:
parent
e882b8d7c5
commit
9600c27c58
6 changed files with 121 additions and 17 deletions
|
@ -91,6 +91,10 @@ static const UChar gIsWordPattern[] = {
|
|||
// \ p { N d } ]
|
||||
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gIsSpacePattern[] = {
|
||||
// [ \ t \ n \ f \ r \ p { Z } ]
|
||||
0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UnicodeSet *gPropSets[URX_LAST_SET];
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
|
@ -128,6 +132,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
|
||||
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
|
||||
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
|
||||
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete gRuleSets[kRuleSet_rule_char-128];
|
||||
delete gRuleSets[kRuleSet_white_space-128];
|
||||
|
@ -721,6 +727,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashS:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashs:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashW:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
|
||||
|
@ -761,6 +777,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
compileSet(theSet);
|
||||
}
|
||||
break;
|
||||
|
||||
case doEnterQuoteMode:
|
||||
// Just scanned a \Q. Put character scanner into quote mode.
|
||||
fQuoteMode = TRUE;
|
||||
break;
|
||||
|
||||
default:
|
||||
error(U_BRK_INTERNAL_ERROR);
|
||||
|
@ -954,6 +975,7 @@ static const UChar chNEL = 0x85; // NEL newline variant
|
|||
static const UChar chLS = 0x2028; // Unicode Line Separator
|
||||
static const UChar chApos = 0x27; // single quote, for quoted chars.
|
||||
static const UChar chPound = 0x23; // '#', introduces a comment.
|
||||
static const UChar chE = 0x45; // 'E'
|
||||
static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
|
||||
static const UChar chLParen = 0x28;
|
||||
static const UChar chRParen = 0x29;
|
||||
|
@ -1042,6 +1064,11 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
|
||||
if (fQuoteMode) {
|
||||
c.fQuoted = TRUE;
|
||||
if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
|
||||
fQuoteMode = FALSE; // Exit quote mode,
|
||||
nextCharLL(); // discard the E
|
||||
nextChar(c); // recurse to get the real next char
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -30,6 +30,7 @@ enum Regex_PatternParseAction {
|
|||
doOpenCaptureParen,
|
||||
doBadOpenParenType,
|
||||
doRuleError,
|
||||
doBackslashs,
|
||||
doStartString,
|
||||
doNGOpt,
|
||||
doBackslashw,
|
||||
|
@ -38,6 +39,7 @@ enum Regex_PatternParseAction {
|
|||
doExprRParen,
|
||||
doBackslashz,
|
||||
doStar,
|
||||
doEnterQuoteMode,
|
||||
doPossesivePlus,
|
||||
doNGStar,
|
||||
doOpenLookAheadNeg,
|
||||
|
@ -54,6 +56,7 @@ enum Regex_PatternParseAction {
|
|||
doBackslashG,
|
||||
doOpt,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doStringChar,
|
||||
doOpenLookAhead,
|
||||
doNumberExpectedError,
|
||||
|
@ -98,7 +101,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
|
||||
, {doNOP, 253, 2,0, FALSE} // 9
|
||||
, {doRuleError, 255, 73,0, FALSE} // 10
|
||||
, {doRuleError, 255, 76,0, FALSE} // 10
|
||||
, {doStringChar, 254, 11,0, TRUE} // 11 string
|
||||
, {doStringChar, 130, 11,0, TRUE} // 12
|
||||
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
|
||||
|
@ -120,10 +123,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
|
||||
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
|
||||
, {doBadOpenParenType, 255, 73,0, FALSE} // 32
|
||||
, {doBadOpenParenType, 255, 76,0, FALSE} // 32
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 73,0, FALSE} // 35
|
||||
, {doBadOpenParenType, 255, 76,0, FALSE} // 35
|
||||
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
|
||||
, {doStar, 255, 22,0, FALSE} // 38
|
||||
|
@ -135,14 +138,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpt, 255, 22,0, FALSE} // 44
|
||||
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
|
||||
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
|
||||
, {doNumberExpectedError, 255, 73,0, FALSE} // 47
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 47
|
||||
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
|
||||
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
|
||||
, {doIntervalDigit, 128, 48,0, TRUE} // 50
|
||||
, {doNumberExpectedError, 255, 73,0, FALSE} // 51
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 51
|
||||
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
|
||||
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
|
||||
, {doNumberExpectedError, 255, 73,0, FALSE} // 54
|
||||
, {doNumberExpectedError, 255, 76,0, FALSE} // 54
|
||||
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
|
||||
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
|
||||
|
@ -155,13 +158,16 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 64
|
||||
, {doProperty, 112 /* p */, 18,0, FALSE} // 65
|
||||
, {doProperty, 80 /* P */, 18,0, FALSE} // 66
|
||||
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 67
|
||||
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 68
|
||||
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 69
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 70
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 71
|
||||
, {doStartString, 255, 11,0, TRUE} // 72
|
||||
, {doExit, 255, 73,0, TRUE} // 73 errorDeath
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 67
|
||||
, {doBackslashS, 83 /* S */, 18,0, TRUE} // 68
|
||||
, {doBackslashs, 115 /* s */, 18,0, TRUE} // 69
|
||||
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 70
|
||||
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 71
|
||||
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 72
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 73
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 74
|
||||
, {doStartString, 255, 11,0, TRUE} // 75
|
||||
, {doExit, 255, 76,0, TRUE} // 76 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -235,6 +241,9 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
|
|
@ -219,6 +219,9 @@ backslash:
|
|||
'G' n term doBackslashG
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'Q' n term doEnterQuoteMode
|
||||
'S' n expr-quant doBackslashS
|
||||
's' n expr-quant doBackslashs
|
||||
'W' n expr-quant doBackslashW
|
||||
'w' n expr-quant doBackslashw
|
||||
'X' n expr-quant doBackslashX
|
||||
|
|
|
@ -59,7 +59,8 @@ static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any m
|
|||
static const uint32_t URX_ISWORD_SET = 1;
|
||||
static const uint32_t URX_ISALNUM_SET = 2;
|
||||
static const uint32_t URX_ISALPHA_SET = 3;
|
||||
static const uint32_t URX_LAST_SET = 4;
|
||||
static const uint32_t URX_ISSPACE_SET = 4;
|
||||
static const uint32_t URX_LAST_SET = 5;
|
||||
|
||||
static const uint32_t URX_NEG_SET = 0x800000; // Flag bit to reverse sense of set
|
||||
// membership test.
|
||||
|
|
|
@ -220,7 +220,18 @@ UBool RegexMatcher::find() {
|
|||
// Start at the position of the last match end. (Will be zero if the
|
||||
// matcher has been reset.)
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
return find(fMatchEnd, status);
|
||||
|
||||
int32_t startPos;
|
||||
for (startPos=fMatchEnd; startPos < fInputLength; startPos++) {
|
||||
MatchAt(startPos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
|
@ -233,6 +244,7 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
|
|||
status = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
this->reset();
|
||||
|
||||
// TODO: optimize a search for the first char of a possible match.
|
||||
// TODO: optimize the search for a leading literal string.
|
||||
|
@ -378,7 +390,7 @@ RegexMatcher &RegexMatcher::reset() {
|
|||
fMatch = FALSE;
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->setElementAt(i, -1);
|
||||
fCaptureStarts->setElementAt(-1, i);
|
||||
}
|
||||
|
||||
return *this;
|
||||
|
@ -537,6 +549,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Clear out capture results from any previous match.
|
||||
// Needed to clear capture groups in patterns with | operations that may not match at all,
|
||||
// although the pattern as a whole does match.
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->setElementAt(-1, i);
|
||||
}
|
||||
|
||||
// Cache frequently referenced items from the compiled pattern
|
||||
// in local variables.
|
||||
//
|
||||
|
@ -678,7 +698,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
|
||||
case URX_BACKSLASH_G: // Test for position at end of previous match
|
||||
if (FALSE) {
|
||||
if (!((fMatch && inputIdx==fMatchEnd) || fMatch==FALSE && inputIdx==0)) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -618,6 +618,38 @@ void RegexTest::API_Match() {
|
|||
delete pat;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// find, with \G in pattern (true if at the end of a previous match).
|
||||
//
|
||||
{
|
||||
int32_t flags=0;
|
||||
UParseError pe;
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
UnicodeString re(".*?(?:(\\Gabc)|(abc))");
|
||||
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString data = ".abcabc.abc..";
|
||||
// 012345678901234567
|
||||
|
||||
RegexMatcher *matcher = pat->matcher(data, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(matcher->find());
|
||||
REGEX_ASSERT(matcher->start(status) == 0);
|
||||
REGEX_ASSERT(matcher->start(1, status) == -1);
|
||||
REGEX_ASSERT(matcher->start(2, status) == 1);
|
||||
|
||||
REGEX_ASSERT(matcher->find());
|
||||
REGEX_ASSERT(matcher->start(status) == 4);
|
||||
REGEX_ASSERT(matcher->start(1, status) == 4);
|
||||
REGEX_ASSERT(matcher->start(2, status) == -1);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
delete matcher;
|
||||
delete pat;
|
||||
}
|
||||
|
||||
//
|
||||
// Replace
|
||||
//
|
||||
|
@ -1016,6 +1048,11 @@ void RegexTest::Extended() {
|
|||
REGEX_FIND( "\\w+", " $%^&*( <0>hello123</0>%^&*(");
|
||||
REGEX_FIND( "\\W+", "<0> $%^&*( </0>hello123%^&*(");
|
||||
|
||||
// \A match at beginning of input only.
|
||||
REGEX_FIND (".*\\Ahello", "<0>hello</0> hello");
|
||||
REGEX_FIND (".*hello", "<0>hello hello</0>");
|
||||
REGEX_FIND(".*\\Ahello", "stuff\nhello"); // don't match after embedded new-line.
|
||||
|
||||
// \b \B
|
||||
REGEX_FIND( ".*?\\b(.).*", "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>");
|
||||
|
||||
|
@ -1040,6 +1077,13 @@ void RegexTest::Extended() {
|
|||
REGEX_FIND("\\D+", "<0>non digits</0>");
|
||||
REGEX_FIND("\\D*(\\d*)(\\D*)", "<0>non-digits<1>3456666</1><2>more non digits</2></0>");
|
||||
|
||||
// \Q...\E quote mode
|
||||
REGEX_FIND("hel\\Qlo, worl\\Ed", "<0>hello, world</0>");
|
||||
REGEX_FIND("\\Q$*^^(*)?\\A\\E(a*)", "<0>$*^^(*)?\\\\A<1>aaaaaaaaaaaaaaa</1></0>");
|
||||
|
||||
// \S and \s space characters
|
||||
REGEX_FIND("\\s+", "not_space<0> \\t \\r \\n \\u3000 \\u2004 \\u2028 \\u2029</0>xyz");
|
||||
REGEX_FIND("(\\S+).*?(\\S+).*", "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>");
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue