ICU-2422 regexp, free-format support added, some match mode bugs fixed

X-SVN-Rev: 11035
This commit is contained in:
Andy Heninger 2003-02-13 01:10:22 +00:00
parent 22d336f746
commit 14da9b81e3
8 changed files with 118 additions and 52 deletions

View file

@ -175,20 +175,21 @@ static void caseClose(UnicodeSet *theSet) {
//----------------------------------------------------------------------------------------
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
{
fStatus = &status;
fStatus = &status;
fRXPat = rxp;
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
fFreeForm = FALSE;
fModeFlags = fRXPat->fFlags;
fRXPat = rxp;
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
fInBackslashQuote = FALSE;
fModeFlags = fRXPat->fFlags;
fEOLComments = TRUE;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
if (U_FAILURE(status)) {
return;
@ -304,7 +305,7 @@ void RegexCompile::compile(
U_ASSERT(state != 0);
// Find the state table element that matches the input char from the rule, or the
// Find the state table element that matches the input char from the pattern, or the
// class of the input character. Start with the first table row for this
// state, then linearly scan forward until we find a row that matches the
// character. The last row for each state always matches all characters, so
@ -337,7 +338,7 @@ void RegexCompile::compile(
}
if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
fC.fQuoted == FALSE && // char is not escaped &&
fC.fQuoted == FALSE && // char is not escaped &&
fC.fChar != (UChar32)-1) { // char is not EOF
UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
if (uniset->contains(fC.fChar)) {
@ -373,6 +374,10 @@ void RegexCompile::compile(
fStack[fStackPtr] = tableEl->fPushState;
}
//
// NextChar. This is where characters are actually fetched from the pattern.
// Happens under control of the 'n' tag in the state table.
//
if (tableEl->fNextChar) {
nextChar(fC);
}
@ -1300,9 +1305,17 @@ UBool RegexCompile::doParseActions(EParseAction action)
}
break;
case doSuppressComments:
// We have just scanned a '(?'. We now need to prevent the character scanner from
// treating a '#' as a to-the-end-of-line comment.
// (This Perl compatibility just gets uglier and uglier to do...)
fEOLComments = FALSE;
break;
default:
U_ASSERT(FALSE);
error(U_REGEX_INTERNAL_ERROR);
returnVal = FALSE;
break;
@ -1863,8 +1876,8 @@ static const UChar chUpperP = 0x50;
//----------------------------------------------------------------------------------------
//
// nextCharLL Low Level Next Char from the regex pattern.
// Get a char from the string,
// keep track of input position for error reporting.
// Get a char from the string, keep track of input position
// for error reporting.
//
//----------------------------------------------------------------------------------------
UChar32 RegexCompile::nextCharLL() {
@ -1929,9 +1942,6 @@ UChar32 RegexCompile::peekCharLL() {
//---------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {
// Unicode Character constants needed for the processing done by nextChar(),
// in hex because literals wont work on EBCDIC machines.
fScanIndex = fNextIndex;
c.fChar = nextCharLL();
c.fQuoted = FALSE;
@ -1944,39 +1954,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
nextChar(c); // recurse to get the real next char
}
}
else if (fInBackslashQuote) {
// The current character immediately follows a '\'
// Don't check for any further escapes, just return it as-is.
// Don't set c.fQuoted, because that would prevent the state machine from
// dispatching on the character.
fInBackslashQuote = FALSE;
}
else
{
// We are not in a 'quoted region' of the source.
// We are not in a \Q quoted region \E of the source.
//
if (fFreeForm && c.fChar == chPound) {
// Start of a comment. Consume the rest of it.
// The new-line char that terminates the comment is always returned.
// It will be treated as white-space, and serves to break up anything
// that might otherwise incorrectly clump together with a comment in
// the middle (a variable name, for example.)
if (fModeFlags & UREGEX_COMMENTS) {
//
// We are in free-spacing and comments mode.
// Scan through any white space and comments, until we
// reach a significant character or the end of input.
for (;;) {
if (c.fChar == (UChar32)-1) {
break; // End of Input
}
if (c.fChar == chPound && fEOLComments == TRUE) {
// Start of a comment. Consume the rest of it, until EOF or a new line
for (;;) {
c.fChar = nextCharLL();
if (c.fChar == (UChar32)-1 || // EOF
c.fChar == chCR ||
c.fChar == chLF ||
c.fChar == chNEL ||
c.fChar == chLS) {
break;
}
}
}
if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
// TODO: is RuleWhiteSpace the right thing to use here?
break;
}
c.fChar = nextCharLL();
if (c.fChar == (UChar32)-1 || // EOF
c.fChar == chCR ||
c.fChar == chLF ||
c.fChar == chNEL ||
c.fChar == chLS) {break;}
}
}
if (c.fChar == (UChar32)-1) {
return;
}
//
// check for backslash escaped characters.
// Use UnicodeString::unescapeAt() to handle those that it can.
// Otherwise just return the '\', and let the pattern parser deal with it.
//
int32_t startX = fNextIndex; // start and end positions of the
int32_t endX = fNextIndex; // sequence following the '\'
int32_t startX = fNextIndex; // start and end positions of the
int32_t endX = fNextIndex; // sequence following the '\'
if (c.fChar == chBackSlash) {
if (gUnescapeCharSet->contains(peekCharLL())) {
nextCharLL(); // get & discard the peeked char.
//
// A '\' sequence that is handled by ICU's standard unescapeAt function.
// Includes \uxxxx, \n, \r, many others.
// Return the single equivalent character.
//
nextCharLL(); // get & discard the peeked char.
c.fQuoted = TRUE;
c.fChar = fRXPat->fPattern.unescapeAt(endX);
if (startX == endX) {
@ -1985,8 +2016,21 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
fCharNum += endX - startX;
fNextIndex = endX;
}
else
{
// We are in a '\' escape that will be handled by the state table scanner.
// Just return the backslash, but remember that the following char is to
// be taken literally. TODO: this is awkward
fInBackslashQuote = TRUE;
}
}
}
// re-enable # to end-of-line comments, in case they were disabled.
// They are disabled by the parser upon seeing '(?', but this lasts for
// the fetching of the next character only.
fEOLComments = TRUE;
// putc(c.fChar, stdout);
}

View file

@ -118,8 +118,10 @@ private:
// in the rule input string.
int32_t fNextIndex; // Index of the next character, which
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a quoted region
UBool fFreeForm; // Scan mode is free-form, ignore spaces.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
int fLineNum; // Line number in input file.
int fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF

View file

@ -60,6 +60,7 @@ enum Regex_PatternParseAction {
doPossesiveOpt,
doEscapeError,
doBackslashG,
doSuppressComments,
doMatchModeParen,
doOpt,
doInterval,
@ -127,11 +128,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21
, {doNOP, 255, 2,0, FALSE} // 22
, {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30

View file

@ -64,7 +64,7 @@ start:
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n expr-quant doScanUnicodeSet
'(' n open-paren
@ -109,7 +109,7 @@ expr-cont:
# branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
'?' n open-paren-quant2
'?' n open-paren-quant2 doSuppressComments
default open-paren
open-paren-quant2:
@ -122,7 +122,7 @@ open-paren-quant2:
# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
#
open-paren:
'?' n open-paren-extended
'?' n open-paren-extended doSuppressComments
default term ^expr-quant doOpenCaptureParen
open-paren-extended:

View file

@ -844,9 +844,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// We are at the start input. Success.
break;
}
// Check the character just before the current pos.
// Check whether character just before the current pos is a new-line
// unless we are at the end of input
UChar c = inputBuf[fp->fInputIdx - 1];
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
if ((fp->fInputIdx < inputLen) &&
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
// It's a new-line. ^ is true. Success.
break;
}

View file

@ -215,7 +215,7 @@ RegexPattern *RegexPattern::compile(
return NULL;
}
if ((flags & (UREGEX_CANON_EQ | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE)) != 0) {
if ((flags & UREGEX_CANON_EQ) != 0) {
status = U_REGEX_UNIMPLEMENTED;
return NULL;
}

View file

@ -394,9 +394,12 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_TESTLM("(a)|\\1", "x", FALSE, FALSE);
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
// REGEX_TESTLM("^a (?#xxx) (?#yyy) {3}c", "accc", FALSE, FALSE);
UParseError pe;
UErrorCode status;
RegexPattern::compile("^a (?#xxx) (?#yyy) {3}c", UREGEX_COMMENTS, pe, status);
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
exit(1);
#endif

View file

@ -216,3 +216,17 @@
"(?:(?i)a)b" "<0>Ab</0>"
"ab(?i)cd" "<0>abCd</0>"
"ab$cd" "abcd"
# White space handling
"a b" "ab"
"abc " "abc"
"abc " "<0>abc </0>"
#"ab[cd e]z" "<0>ab z</0>" #TODO: white space handling in Unicode Sets.
#Multi-line mode
'b\s^' m "a\nb\n"
# Free-spacing mode
"a b c # this is a comment" x "<0>abc</0> "
'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac</0>"