mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2422 regexp, free-format support added, some match mode bugs fixed
X-SVN-Rev: 11035
This commit is contained in:
parent
22d336f746
commit
14da9b81e3
8 changed files with 118 additions and 52 deletions
icu4c/source
i18n
test
|
@ -175,20 +175,21 @@ static void caseClose(UnicodeSet *theSet) {
|
|||
//----------------------------------------------------------------------------------------
|
||||
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
|
||||
{
|
||||
fStatus = &status;
|
||||
fStatus = &status;
|
||||
|
||||
fRXPat = rxp;
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fModeFlags = fRXPat->fFlags;
|
||||
fRXPat = rxp;
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fInBackslashQuote = FALSE;
|
||||
fModeFlags = fRXPat->fFlags;
|
||||
fEOLComments = TRUE;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -304,7 +305,7 @@ void RegexCompile::compile(
|
|||
|
||||
U_ASSERT(state != 0);
|
||||
|
||||
// Find the state table element that matches the input char from the rule, or the
|
||||
// Find the state table element that matches the input char from the pattern, or the
|
||||
// class of the input character. Start with the first table row for this
|
||||
// state, then linearly scan forward until we find a row that matches the
|
||||
// character. The last row for each state always matches all characters, so
|
||||
|
@ -337,7 +338,7 @@ void RegexCompile::compile(
|
|||
}
|
||||
|
||||
if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
|
||||
fC.fQuoted == FALSE && // char is not escaped &&
|
||||
fC.fQuoted == FALSE && // char is not escaped &&
|
||||
fC.fChar != (UChar32)-1) { // char is not EOF
|
||||
UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
|
||||
if (uniset->contains(fC.fChar)) {
|
||||
|
@ -373,6 +374,10 @@ void RegexCompile::compile(
|
|||
fStack[fStackPtr] = tableEl->fPushState;
|
||||
}
|
||||
|
||||
//
|
||||
// NextChar. This is where characters are actually fetched from the pattern.
|
||||
// Happens under control of the 'n' tag in the state table.
|
||||
//
|
||||
if (tableEl->fNextChar) {
|
||||
nextChar(fC);
|
||||
}
|
||||
|
@ -1300,9 +1305,17 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
}
|
||||
break;
|
||||
|
||||
case doSuppressComments:
|
||||
// We have just scanned a '(?'. We now need to prevent the character scanner from
|
||||
// treating a '#' as a to-the-end-of-line comment.
|
||||
// (This Perl compatibility just gets uglier and uglier to do...)
|
||||
fEOLComments = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
|
||||
default:
|
||||
U_ASSERT(FALSE);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
@ -1863,8 +1876,8 @@ static const UChar chUpperP = 0x50;
|
|||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// nextCharLL Low Level Next Char from the regex pattern.
|
||||
// Get a char from the string,
|
||||
// keep track of input position for error reporting.
|
||||
// Get a char from the string, keep track of input position
|
||||
// for error reporting.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
UChar32 RegexCompile::nextCharLL() {
|
||||
|
@ -1929,9 +1942,6 @@ UChar32 RegexCompile::peekCharLL() {
|
|||
//---------------------------------------------------------------------------------
|
||||
void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
|
||||
// Unicode Character constants needed for the processing done by nextChar(),
|
||||
// in hex because literals wont work on EBCDIC machines.
|
||||
|
||||
fScanIndex = fNextIndex;
|
||||
c.fChar = nextCharLL();
|
||||
c.fQuoted = FALSE;
|
||||
|
@ -1944,39 +1954,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
nextChar(c); // recurse to get the real next char
|
||||
}
|
||||
}
|
||||
else if (fInBackslashQuote) {
|
||||
// The current character immediately follows a '\'
|
||||
// Don't check for any further escapes, just return it as-is.
|
||||
// Don't set c.fQuoted, because that would prevent the state machine from
|
||||
// dispatching on the character.
|
||||
fInBackslashQuote = FALSE;
|
||||
}
|
||||
else
|
||||
{
|
||||
// We are not in a 'quoted region' of the source.
|
||||
// We are not in a \Q quoted region \E of the source.
|
||||
//
|
||||
if (fFreeForm && c.fChar == chPound) {
|
||||
// Start of a comment. Consume the rest of it.
|
||||
// The new-line char that terminates the comment is always returned.
|
||||
// It will be treated as white-space, and serves to break up anything
|
||||
// that might otherwise incorrectly clump together with a comment in
|
||||
// the middle (a variable name, for example.)
|
||||
if (fModeFlags & UREGEX_COMMENTS) {
|
||||
//
|
||||
// We are in free-spacing and comments mode.
|
||||
// Scan through any white space and comments, until we
|
||||
// reach a significant character or the end of input.
|
||||
for (;;) {
|
||||
if (c.fChar == (UChar32)-1) {
|
||||
break; // End of Input
|
||||
}
|
||||
if (c.fChar == chPound && fEOLComments == TRUE) {
|
||||
// Start of a comment. Consume the rest of it, until EOF or a new line
|
||||
for (;;) {
|
||||
c.fChar = nextCharLL();
|
||||
if (c.fChar == (UChar32)-1 || // EOF
|
||||
c.fChar == chCR ||
|
||||
c.fChar == chLF ||
|
||||
c.fChar == chNEL ||
|
||||
c.fChar == chLS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
|
||||
// TODO: is RuleWhiteSpace the right thing to use here?
|
||||
break;
|
||||
}
|
||||
c.fChar = nextCharLL();
|
||||
if (c.fChar == (UChar32)-1 || // EOF
|
||||
c.fChar == chCR ||
|
||||
c.fChar == chLF ||
|
||||
c.fChar == chNEL ||
|
||||
c.fChar == chLS) {break;}
|
||||
}
|
||||
}
|
||||
if (c.fChar == (UChar32)-1) {
|
||||
return;
|
||||
}
|
||||
|
||||
//
|
||||
// check for backslash escaped characters.
|
||||
// Use UnicodeString::unescapeAt() to handle those that it can.
|
||||
// Otherwise just return the '\', and let the pattern parser deal with it.
|
||||
//
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
if (c.fChar == chBackSlash) {
|
||||
if (gUnescapeCharSet->contains(peekCharLL())) {
|
||||
nextCharLL(); // get & discard the peeked char.
|
||||
//
|
||||
// A '\' sequence that is handled by ICU's standard unescapeAt function.
|
||||
// Includes \uxxxx, \n, \r, many others.
|
||||
// Return the single equivalent character.
|
||||
//
|
||||
nextCharLL(); // get & discard the peeked char.
|
||||
c.fQuoted = TRUE;
|
||||
c.fChar = fRXPat->fPattern.unescapeAt(endX);
|
||||
if (startX == endX) {
|
||||
|
@ -1985,8 +2016,21 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
fCharNum += endX - startX;
|
||||
fNextIndex = endX;
|
||||
}
|
||||
else
|
||||
{
|
||||
// We are in a '\' escape that will be handled by the state table scanner.
|
||||
// Just return the backslash, but remember that the following char is to
|
||||
// be taken literally. TODO: this is awkward
|
||||
fInBackslashQuote = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// re-enable # to end-of-line comments, in case they were disabled..
|
||||
// They are disabled by the parser upon seeing '(?', but this lasts for
|
||||
// the fetching of the next character only.
|
||||
fEOLComments = TRUE;
|
||||
|
||||
// putc(c.fChar, stdout);
|
||||
}
|
||||
|
||||
|
|
|
@ -118,8 +118,10 @@ private:
|
|||
// in the rule input string.
|
||||
int32_t fNextIndex; // Index of the next character, which
|
||||
// is the first character not yet scanned.
|
||||
UBool fQuoteMode; // Scan is in a quoted region
|
||||
UBool fFreeForm; // Scan mode is free-form, ignore spaces.
|
||||
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
|
||||
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
|
||||
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
|
||||
// end of line comments, in favor of (?#...) comments.
|
||||
int fLineNum; // Line number in input file.
|
||||
int fCharNum; // Char position within the line.
|
||||
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
||||
|
|
|
@ -60,6 +60,7 @@ enum Regex_PatternParseAction {
|
|||
doPossesiveOpt,
|
||||
doEscapeError,
|
||||
doBackslashG,
|
||||
doSuppressComments,
|
||||
doMatchModeParen,
|
||||
doOpt,
|
||||
doInterval,
|
||||
|
@ -127,11 +128,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21
|
||||
, {doNOP, 255, 2,0, FALSE} // 22
|
||||
, {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doNOP, 255, 27,0, FALSE} // 24
|
||||
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 255, 29,0, FALSE} // 26
|
||||
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
|
||||
|
|
|
@ -64,7 +64,7 @@ start:
|
|||
# term. At a position where we can accept the start of most items in a pattern.
|
||||
#
|
||||
term:
|
||||
quoted n expr-quant doLiteralChar
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren
|
||||
|
@ -109,7 +109,7 @@ expr-cont:
|
|||
# branches into the normal parenthesis sequence as quickly as possible.
|
||||
#
|
||||
open-paren-quant:
|
||||
'?' n open-paren-quant2
|
||||
'?' n open-paren-quant2 doSuppressComments
|
||||
default open-paren
|
||||
|
||||
open-paren-quant2:
|
||||
|
@ -122,7 +122,7 @@ open-paren-quant2:
|
|||
# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
|
||||
#
|
||||
open-paren:
|
||||
'?' n open-paren-extended
|
||||
'?' n open-paren-extended doSuppressComments
|
||||
default term ^expr-quant doOpenCaptureParen
|
||||
|
||||
open-paren-extended:
|
||||
|
|
|
@ -844,9 +844,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
// We are at the start of input. Success.
|
||||
break;
|
||||
}
|
||||
// Check the character just before the current pos.
|
||||
// Check whether character just before the current pos is a new-line
|
||||
// unless we are at the end of input
|
||||
UChar c = inputBuf[fp->fInputIdx - 1];
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
if ((fp->fInputIdx < inputLen) &&
|
||||
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
// It's a new-line. ^ is true. Success.
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -215,7 +215,7 @@ RegexPattern *RegexPattern::compile(
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if ((flags & (UREGEX_CANON_EQ | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE)) != 0) {
|
||||
if ((flags & UREGEX_CANON_EQ) != 0) {
|
||||
status = U_REGEX_UNIMPLEMENTED;
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -394,9 +394,12 @@ void RegexTest::Basic() {
|
|||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_TESTLM("(a)|\\1", "x", FALSE, FALSE);
|
||||
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
// REGEX_TESTLM("^a (?#xxx) (?#yyy) {3}c", "accc", FALSE, FALSE);
|
||||
UParseError pe;
|
||||
UErrorCode status;
|
||||
RegexPattern::compile("^a (?#xxx) (?#yyy) {3}c", UREGEX_COMMENTS, pe, status);
|
||||
// REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
exit(1);
|
||||
#endif
|
||||
|
|
14
icu4c/source/test/testdata/regextst.txt
vendored
14
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -216,3 +216,17 @@
|
|||
"(?:(?i)a)b" "<0>Ab</0>"
|
||||
"ab(?i)cd" "<0>abCd</0>"
|
||||
"ab$cd" "abcd"
|
||||
|
||||
# White space handling
|
||||
"a b" "ab"
|
||||
"abc " "abc"
|
||||
"abc " "<0>abc </0>"
|
||||
#"ab[cd e]z" "<0>ab z</0>" #TODO: white space handling in Unicode Sets.
|
||||
|
||||
|
||||
#Multi-line mode
|
||||
'b\s^' m "a\nb\n"
|
||||
|
||||
# Free-spacing mode
|
||||
"a b c # this is a comment" x "<0>abc</0> "
|
||||
'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac</0>"
|
||||
|
|
Loading…
Add table
Reference in a new issue