ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10135
This commit is contained in:
Andy Heninger 2002-10-31 23:01:54 +00:00
parent e882b8d7c5
commit 9600c27c58
6 changed files with 121 additions and 17 deletions

View file

@ -91,6 +91,10 @@ static const UChar gIsWordPattern[] = {
// \ p { N d } ]
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};
static const UChar gIsSpacePattern[] = {
// [ \ t \ n \ f \ r \ p { Z } ]
0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d, 0};
static const UnicodeSet *gPropSets[URX_LAST_SET];
//----------------------------------------------------------------------------------------
@ -128,6 +132,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
if (U_FAILURE(status)) {
delete gRuleSets[kRuleSet_rule_char-128];
delete gRuleSets[kRuleSet_white_space-128];
@ -721,6 +727,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
break;
case doBackslashS:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
break;
case doBackslashs:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
break;
case doBackslashW:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
@ -761,6 +777,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
compileSet(theSet);
}
break;
case doEnterQuoteMode:
// Just scanned a \Q. Put character scanner into quote mode.
fQuoteMode = TRUE;
break;
default:
error(U_BRK_INTERNAL_ERROR);
@ -954,6 +975,7 @@ static const UChar chNEL = 0x85; // NEL newline variant
static const UChar chLS = 0x2028; // Unicode Line Separator
static const UChar chApos = 0x27; // single quote, for quoted chars.
static const UChar chPound = 0x23; // '#', introduces a comment.
static const UChar chE = 0x45; // 'E'
static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
static const UChar chLParen = 0x28;
static const UChar chRParen = 0x29;
@ -1042,6 +1064,11 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
if (fQuoteMode) {
c.fQuoted = TRUE;
if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
fQuoteMode = FALSE; // Exit quote mode,
nextCharLL(); // discard the E
nextChar(c); // recurse to get the real next char
}
}
else
{

View file

@ -30,6 +30,7 @@ enum Regex_PatternParseAction {
doOpenCaptureParen,
doBadOpenParenType,
doRuleError,
doBackslashs,
doStartString,
doNGOpt,
doBackslashw,
@ -38,6 +39,7 @@ enum Regex_PatternParseAction {
doExprRParen,
doBackslashz,
doStar,
doEnterQuoteMode,
doPossesivePlus,
doNGStar,
doOpenLookAheadNeg,
@ -54,6 +56,7 @@ enum Regex_PatternParseAction {
doBackslashG,
doOpt,
doOpenAtomicParen,
doBackslashS,
doStringChar,
doOpenLookAhead,
doNumberExpectedError,
@ -98,7 +101,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
, {doNOP, 253, 2,0, FALSE} // 9
, {doRuleError, 255, 73,0, FALSE} // 10
, {doRuleError, 255, 76,0, FALSE} // 10
, {doStringChar, 254, 11,0, TRUE} // 11 string
, {doStringChar, 130, 11,0, TRUE} // 12
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
@ -120,10 +123,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
, {doBadOpenParenType, 255, 73,0, FALSE} // 32
, {doBadOpenParenType, 255, 76,0, FALSE} // 32
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
, {doBadOpenParenType, 255, 73,0, FALSE} // 35
, {doBadOpenParenType, 255, 76,0, FALSE} // 35
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
, {doStar, 255, 22,0, FALSE} // 38
@ -135,14 +138,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpt, 255, 22,0, FALSE} // 44
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
, {doNumberExpectedError, 255, 73,0, FALSE} // 47
, {doNumberExpectedError, 255, 76,0, FALSE} // 47
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
, {doIntervalDigit, 128, 48,0, TRUE} // 50
, {doNumberExpectedError, 255, 73,0, FALSE} // 51
, {doNumberExpectedError, 255, 76,0, FALSE} // 51
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
, {doNumberExpectedError, 255, 73,0, FALSE} // 54
, {doNumberExpectedError, 255, 76,0, FALSE} // 54
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
@ -155,13 +158,16 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 64
, {doProperty, 112 /* p */, 18,0, FALSE} // 65
, {doProperty, 80 /* P */, 18,0, FALSE} // 66
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 67
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 68
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 69
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 70
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 71
, {doStartString, 255, 11,0, TRUE} // 72
, {doExit, 255, 73,0, TRUE} // 73 errorDeath
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 67
, {doBackslashS, 83 /* S */, 18,0, TRUE} // 68
, {doBackslashs, 115 /* s */, 18,0, TRUE} // 69
, {doBackslashW, 87 /* W */, 18,0, TRUE} // 70
, {doBackslashw, 119 /* w */, 18,0, TRUE} // 71
, {doBackslashX, 88 /* X */, 18,0, TRUE} // 72
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 73
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 74
, {doStartString, 255, 11,0, TRUE} // 75
, {doExit, 255, 76,0, TRUE} // 76 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -235,6 +241,9 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -219,6 +219,9 @@ backslash:
'G' n term doBackslashG
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'Q' n term doEnterQuoteMode
'S' n expr-quant doBackslashS
's' n expr-quant doBackslashs
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX

View file

@ -59,7 +59,8 @@ static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any m
static const uint32_t URX_ISWORD_SET = 1;
static const uint32_t URX_ISALNUM_SET = 2;
static const uint32_t URX_ISALPHA_SET = 3;
static const uint32_t URX_LAST_SET = 4;
static const uint32_t URX_ISSPACE_SET = 4;
static const uint32_t URX_LAST_SET = 5;
static const uint32_t URX_NEG_SET = 0x800000; // Flag bit to reverse sense of set
// membership test.

View file

@ -220,7 +220,18 @@ UBool RegexMatcher::find() {
// Start at the position of the last match end. (Will be zero if the
// matcher has been reset.)
UErrorCode status = U_ZERO_ERROR;
return find(fMatchEnd, status);
int32_t startPos;
for (startPos=fMatchEnd; startPos < fInputLength; startPos++) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
return FALSE;
}
@ -233,6 +244,7 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
this->reset();
// TODO: optimize a search for the first char of a possible match.
// TODO: optimize the search for a leading literal string.
@ -378,7 +390,7 @@ RegexMatcher &RegexMatcher::reset() {
fMatch = FALSE;
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->setElementAt(i, -1);
fCaptureStarts->setElementAt(-1, i);
}
return *this;
@ -537,6 +549,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
return;
}
// Clear out capture results from any previous match.
// Needed to clear capture groups in patterns with | operations that may not match at all,
// although the pattern as a whole does match.
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->setElementAt(-1, i);
}
// Cache frequently referenced items from the compiled pattern
// in local variables.
//
@ -678,7 +698,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_G: // Test for position at end of previous match
if (FALSE) {
if (!((fMatch && inputIdx==fMatchEnd) || fMatch==FALSE && inputIdx==0)) {
backTrack(inputIdx, patIdx);
}
break;

View file

@ -618,6 +618,38 @@ void RegexTest::API_Match() {
delete pat;
}
//
// find, with \G in pattern (true if at the end of a previous match).
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re(".*?(?:(\\Gabc)|(abc))");
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = ".abcabc.abc..";
// 012345678901234567
RegexMatcher *matcher = pat->matcher(data, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 0);
REGEX_ASSERT(matcher->start(1, status) == -1);
REGEX_ASSERT(matcher->start(2, status) == 1);
REGEX_ASSERT(matcher->find());
REGEX_ASSERT(matcher->start(status) == 4);
REGEX_ASSERT(matcher->start(1, status) == 4);
REGEX_ASSERT(matcher->start(2, status) == -1);
REGEX_CHECK_STATUS;
delete matcher;
delete pat;
}
//
// Replace
//
@ -1016,6 +1048,11 @@ void RegexTest::Extended() {
REGEX_FIND( "\\w+", " $%^&*( <0>hello123</0>%^&*(");
REGEX_FIND( "\\W+", "<0> $%^&*( </0>hello123%^&*(");
// \A match at beginning of input only.
REGEX_FIND (".*\\Ahello", "<0>hello</0> hello");
REGEX_FIND (".*hello", "<0>hello hello</0>");
REGEX_FIND(".*\\Ahello", "stuff\nhello"); // don't match after embedded new-line.
// \b \B
REGEX_FIND( ".*?\\b(.).*", "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>");
@ -1040,6 +1077,13 @@ void RegexTest::Extended() {
REGEX_FIND("\\D+", "<0>non digits</0>");
REGEX_FIND("\\D*(\\d*)(\\D*)", "<0>non-digits<1>3456666</1><2>more non digits</2></0>");
// \Q...\E quote mode
REGEX_FIND("hel\\Qlo, worl\\Ed", "<0>hello, world</0>");
REGEX_FIND("\\Q$*^^(*)?\\A\\E(a*)", "<0>$*^^(*)?\\\\A<1>aaaaaaaaaaaaaaa</1></0>");
// \S and \s space characters
REGEX_FIND("\\s+", "not_space<0> \\t \\r \\n \\u3000 \\u2004 \\u2028 \\u2029</0>xyz");
REGEX_FIND("(\\S+).*?(\\S+).*", "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>");
}