ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10220
This commit is contained in:
Andy Heninger 2002-11-11 18:49:49 +00:00
parent 1560a2fb59
commit fa16d0f578
8 changed files with 552 additions and 336 deletions

View file

@ -244,13 +244,13 @@ void RegexCompile::compile(
//
tableEl = &gRuleParseStateTable[state];
if (RESCAN_DEBUG) {
printf("char, line, col = (\'%c\', %d, %d) state=%s ",
printf( "char, line, col = (\'%c\', %d, %d) state=%s ",
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
}
for (;;) { // loop through table rows belonging to this state, looking for one
// that matches the current input char.
if (RESCAN_DEBUG) { printf(".");}
if (RESCAN_DEBUG) { printf( ".");}
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
// Table row specified an individual character, not a set, and
// the input character is not quoted, and
@ -284,7 +284,7 @@ void RegexCompile::compile(
// No match on this row, advance to the next row for this state,
tableEl++;
}
if (RESCAN_DEBUG) { printf("\n");}
if (RESCAN_DEBUG) { printf( "\n");}
//
// We've found the row of the state table that matches the current input
@ -301,7 +301,7 @@ void RegexCompile::compile(
fStackPtr++;
if (fStackPtr >= kStackSize) {
error(U_REGEX_INTERNAL_ERROR);
printf("RegexCompile::parse() - state stack overflow.\n");
// printf( "RegexCompile::parse() - state stack overflow.\n");
fStackPtr--;
}
fStack[fStackPtr] = tableEl->fPushState;
@ -319,9 +319,12 @@ void RegexCompile::compile(
state = fStack[fStackPtr];
fStackPtr--;
if (fStackPtr < 0) {
error(U_REGEX_INTERNAL_ERROR);
printf("RegexCompile::compile() - state stack underflow.\n");
// state stack underflow
// This will occur if the user pattern has mis-matched parentheses,
// with extra close parens.
//
fStackPtr++;
error(U_REGEX_MISMATCHED_PAREN);
}
}
@ -637,94 +640,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doStartString:
// We've just scanned a single "normal" character from the pattern,
// which is a character without special meaning that will need to be
// matched literally. Save it away. It may be the start of a string.
{
fStringOpStart = fRXPat->fLiteralText.length();
fRXPat->fLiteralText.append(fC.fChar);
break;
}
case doStringChar:
// We've just scanned a "normal" character from the pattern, which now
// needs to be appended to the literal match string that is
// already being assembled.
{
fRXPat->fLiteralText.append(fC.fChar);
break;
}
case doSplitString:
// We've just peeked at a quantifier, e.g. a *, following a scanned string.
// Separate the last character from the string, because the quantifier
// only applies to it, not to the entire string. Emit into the compiled
// pattern:
// - string chars[0..n-2] (as a string, assuming more than one char)
// - string char [n-1] (as a single character)
{
// Locate the positions of the last and next-to-last characters
// in the string. Requires a bit of futzing around to account for
// surrogate pairs, since we want 32 bit code points, not 16 bit code units.
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
U_ASSERT(strLength > 0);
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
int32_t nextToLastCharIdx = lastCharIdx-1;
if (nextToLastCharIdx > fStringOpStart) {
nextToLastCharIdx = fRXPat->fLiteralText.getChar32Start(nextToLastCharIdx);
}
if (nextToLastCharIdx > fStringOpStart) {
// The string contains three or more code units.
// emit the first through the next-to-last as a string.
int32_t stringToken = URX_BUILD(URX_STRING, fStringOpStart);
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
stringToken = URX_BUILD(URX_STRING_LEN, lastCharIdx - fStringOpStart);
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
}
else if (nextToLastCharIdx == fStringOpStart) {
// The string contains exactly two code units.
// emit the first into the compiled pattern as a single char
UChar32 c = fRXPat->fLiteralText.char32At(nextToLastCharIdx);
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
}
// In all cases emit the last char as a single character.
UChar32 c = fRXPat->fLiteralText.char32At(lastCharIdx);
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
}
case doLiteralChar:
// We've just scanned a "normal" character from the pattern,
literalChar();
break;
case doEndString:
// We have reached the end of a literal string in the pattern.
// Emit the string token into the compiled pattern, or if the string
// has only one character, emit the single character token instead.
{
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
U_ASSERT(strLength > 0);
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
if (lastCharIdx == fStringOpStart) {
// The string contains exactly one character.
// Emit it into the compiled pattern as a single char.
int32_t charToken = URX_BUILD(URX_ONECHAR, fRXPat->fLiteralText.char32At(fStringOpStart));
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
} else {
// The string contains two or more chars. Emit as a string.
// Compiled string consumes two tokens in the compiled pattern, one
// for the index of the start-of-string, and one for the length.
int32_t stringToken = URX_BUILD(URX_STRING, fStringOpStart);
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
stringToken = URX_BUILD(URX_STRING_LEN, strLength);
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
}
}
break;
case doDotAny:
@ -858,6 +779,151 @@ UBool RegexCompile::doParseActions(EParseAction action)
};
//------------------------------------------------------------------------------
//
// literalChar We've encountered a literal character from the pattern,
// or an escape sequence that reduces to a character.
// Add it to the string containing all literal chars/strings from
// the pattern.
// If we are in a pattern string already, add the new char to it.
// If we aren't in a pattern string, begin one now.
//
//------------------------------------------------------------------------------
void RegexCompile::literalChar() {
int32_t op; // An operation in the compiled pattern.
int32_t opType;
int32_t patternLoc; // A position in the compiled pattern.
int32_t stringLen;
// If the last thing compiled into the pattern was not a literal char,
// force this new literal char to begin a new string, and not append to the previous.
op = fRXPat->fCompiledPat->lastElementi();
opType = URX_TYPE(op);
if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) {
fixLiterals();
}
if (fStringOpStart == -1) {
// First char of a string in the pattern.
// Emit a OneChar op into the compiled pattern.
op = URX_BUILD(URX_ONECHAR, fC.fChar);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Also add it to the string pool, in case we get a second adjacent literal
// and want to change form ONE_CHAR to STRING
fStringOpStart = fRXPat->fLiteralText.length();
fRXPat->fLiteralText.append(fC.fChar);
return;
}
// We are adding onto an existing string
fRXPat->fLiteralText.append(fC.fChar);
// If the most recently emitted op is a URX_ONECHAR, change it to a string op.
op = fRXPat->fCompiledPat->lastElementi();
opType = URX_TYPE(op);
U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
if (opType == URX_ONECHAR) {
op = URX_BUILD(URX_STRING, fStringOpStart);
patternLoc = fRXPat->fCompiledPat->size() - 1;
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
op = URX_BUILD(URX_STRING_LEN, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
// The pattern contains a URX_SRING / URX_STRING_LEN. Update the
// string length to reflect the new char we just added to the string.
stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
op = URX_BUILD(URX_STRING_LEN, stringLen);
patternLoc = fRXPat->fCompiledPat->size() - 1;
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
}
//------------------------------------------------------------------------------
//
//   fixLiterals    Close off ("fix") the literal string currently being built,
//                  so that any literal compiled afterwards begins a new string
//                  instead of being appended to the old one.  Call this when
//                  compiling anything that can follow a literal string.
//
//                  split:  when TRUE, additionally detach the final character
//                          of the string into its own URX_ONECHAR op, so that a
//                          following quantifier binds to that character only.
//                          Example:  abc*   - the * applies to the 'c' alone.
//
//------------------------------------------------------------------------------
void RegexCompile::fixLiterals(UBool split) {
    // Note where the current literal string began, then mark it as closed.
    int32_t literalStart = fStringOpStart;
    fStringOpStart = -1;
    if (!split) {
        return;
    }

    // Splitting only matters when the compiled pattern ends with a string op
    // pair.  Any other trailing op is left exactly as it is.
    int32_t tailOp     = fRXPat->fCompiledPat->lastElementi();
    int32_t tailOpType = URX_TYPE(tailOp);
    if (tailOpType != URX_STRING_LEN) {
        return;
    }
    int32_t unitsInString = URX_VAL(tailOp);

    // Locate the final code point of the literal text.  It may occupy two
    // code units (a surrogate pair), hence the code-point-wise index moves.
    int32_t textEnd = fRXPat->fLiteralText.length();
    int32_t lastIdx = fRXPat->fLiteralText.moveIndex32(textEnd, -1);
    UChar32 lastCp  = fRXPat->fLiteralText.char32At(lastIdx);

    // A compiled string is expected to hold at least two code points, so
    // there must be another one in front of the final code point.
    U_ASSERT(lastIdx > literalStart);
    int32_t prevIdx = fRXPat->fLiteralText.moveIndex32(lastIdx, -1);
    U_ASSERT(prevIdx >= literalStart);
    UChar32 prevCp  = fRXPat->fLiteralText.char32At(prevIdx);

    if (prevIdx <= literalStart) {
        // The string was exactly two characters long.  Overwrite the
        // URX_STRING / URX_STRING_LEN pair in place with two URX_ONECHAR ops.
        int32_t firstCharOp = URX_BUILD(URX_ONECHAR, prevCp);
        fRXPat->fCompiledPat->setElementAt(firstCharOp, fRXPat->fCompiledPat->size() - 2);
        int32_t secondCharOp = URX_BUILD(URX_ONECHAR, lastCp);
        fRXPat->fCompiledPat->setElementAt(secondCharOp, fRXPat->fCompiledPat->size() - 1);
    } else {
        // Two or more characters remain once the last one is removed.  Keep
        // the string op, reduce its length by the code units of the final
        // character, and append that character as a separate URX_ONECHAR.
        unitsInString -= (textEnd - lastIdx);
        int32_t shortenedLenOp = URX_BUILD(URX_STRING_LEN, unitsInString);
        fRXPat->fCompiledPat->setElementAt(shortenedLenOp, fRXPat->fCompiledPat->size() - 1);
        int32_t lastCharOp = URX_BUILD(URX_ONECHAR, lastCp);
        fRXPat->fCompiledPat->addElement(lastCharOp, *fStatus);
    }
}
//------------------------------------------------------------------------------
//
// blockTopLoc() Find or create a location in the compiled pattern
@ -889,6 +955,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
// Item just compiled is a single thing, a ".", or a single char, or a set reference.
// No slot for STATE_SAVE was pre-reserved in the compiled code.
// We need to make space now.
fixLiterals(TRUE); // If last item was a string, separate the last char.
theLoc = fRXPat->fCompiledPat->size()-1;
if (reserveLoc) {
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
@ -922,6 +989,10 @@ void RegexCompile::handleCloseParen() {
return;
}
// Force any literal chars that may follow the close paren to start a new string,
// and not attach to any preceding it.
fixLiterals(FALSE);
// Fixup any operations within the just-closed parenthesized group
// that need to reference the end of the (block).
// (The first one on popped from the stack is an unused slot for
@ -1211,7 +1282,7 @@ UnicodeSet *RegexCompile::scanSet() {
if (U_FAILURE(localStatus)) {
// TODO: Get more accurate position of the error from UnicodeSet's return info.
// UnicodeSet appears to not be reporting correctly at this time.
printf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
error(localStatus);
delete uset;
return NULL;

View file

@ -97,6 +97,8 @@ private:
// there is space to add an opcode there.
void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
// a reference to a UnicodeSet.
void literalChar(); // Compile a literal char
void fixLiterals(UBool split=FALSE); // Fix literal strings.
UErrorCode *fStatus;

View file

@ -29,7 +29,6 @@ enum Regex_PatternParseAction {
doBadOpenParenType,
doRuleError,
doBackslashs,
doStartString,
doNGOpt,
doNamedChar,
doBackslashw,
@ -55,9 +54,9 @@ enum Regex_PatternParseAction {
doPossesiveOpt,
doBackslashG,
doOpt,
doLiteralChar,
doOpenAtomicParen,
doBackslashS,
doStringChar,
doOpenLookAhead,
doBackRef,
doDollar,
@ -70,11 +69,9 @@ enum Regex_PatternParseAction {
doExit,
doPatStart,
doBackslashb,
doEndString,
doBackslashd,
doNotImplementedError,
doBackslashd,
doOpenLookBehindNeg,
doSplitString,
rbbiLastAction};
//-------------------------------------------------------------------------------
@ -94,97 +91,88 @@ struct RegexTableEl {
static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 3, 2, FALSE} // 1 start
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
, {doStartString, 254, 13,0, TRUE} // 3 term
, {doStartString, 130, 13,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
, {doNOP, 40 /* ( */, 28, 20, TRUE} // 6
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
, {doNOP, 92 /* \ */, 67,0, TRUE} // 10
, {doNOP, 253, 2,0, FALSE} // 11
, {doRuleError, 255, 87,0, FALSE} // 12
, {doStringChar, 254, 13,0, TRUE} // 13 string
, {doStringChar, 130, 13,0, TRUE} // 14
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
, {doSplitString, 43 /* + */, 20,0, FALSE} // 16
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
, {doEndString, 255, 20,0, FALSE} // 19
, {doNOP, 42 /* * */, 56,0, TRUE} // 20 expr-quant
, {doNOP, 43 /* + */, 59,0, TRUE} // 21
, {doNOP, 63 /* ? */, 62,0, TRUE} // 22
, {doNOP, 123 /* { */, 65,0, TRUE} // 23
, {doNOP, 255, 25,0, FALSE} // 24
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 25 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 26
, {doNOP, 255, 3,0, FALSE} // 27
, {doNOP, 63 /* ? */, 30,0, TRUE} // 28 open-paren
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 29
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 30 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 31
, {doOpenLookAhead, 61 /* = */, 3, 25, TRUE} // 32
, {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE} // 33
, {doNOP, 60 /* < */, 42,0, TRUE} // 34
, {doNOP, 35 /* # */, 45,0, TRUE} // 35
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 36
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 37
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 38
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 39
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 40
, {doBadOpenParenType, 255, 87,0, FALSE} // 41
, {doOpenLookBehind, 61 /* = */, 3, 25, TRUE} // 42 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE} // 43
, {doBadOpenParenType, 255, 87,0, FALSE} // 44
, {doNOP, 41 /* ) */, 3,0, TRUE} // 45 paren-comment
, {doMismatchedParenErr, 253, 87,0, FALSE} // 46
, {doNOP, 255, 45,0, TRUE} // 47
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 48 paren-flag
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 49
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 50
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 51
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 52
, {doNOP, 41 /* ) */, 3,0, TRUE} // 53
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 54
, {doNOP, 255, 87,0, FALSE} // 55
, {doNGStar, 63 /* ? */, 25,0, TRUE} // 56 quant-star
, {doPossesiveStar, 43 /* + */, 25,0, TRUE} // 57
, {doStar, 255, 25,0, FALSE} // 58
, {doNGPlus, 63 /* ? */, 25,0, TRUE} // 59 quant-plus
, {doPossesivePlus, 43 /* + */, 25,0, TRUE} // 60
, {doPlus, 255, 25,0, FALSE} // 61
, {doNGOpt, 63 /* ? */, 25,0, TRUE} // 62 quant-opt
, {doPossesiveOpt, 43 /* + */, 25,0, TRUE} // 63
, {doOpt, 255, 25,0, FALSE} // 64
, {doNOP, 129, 65,0, TRUE} // 65 interval-open
, {doNotImplementedError, 255, 87,0, FALSE} // 66
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 67 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 68
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 69
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 70
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 71
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 72
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 73
, {doProperty, 112 /* p */, 20,0, FALSE} // 74
, {doProperty, 80 /* P */, 20,0, FALSE} // 75
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 76
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 77
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 78
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 79
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 80
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 81
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 82
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 83
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 84
, {doBackRef, 128, 20,0, TRUE} // 85
, {doStartString, 255, 13,0, TRUE} // 86
, {doExit, 255, 87,0, TRUE} // 87 errorDeath
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 12,0, TRUE} // 2 term
, {doLiteralChar, 130, 12,0, TRUE} // 3
, {doScanUnicodeSet, 91 /* [ */, 12,0, TRUE} // 4
, {doNOP, 40 /* ( */, 20,0, TRUE} // 5
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 59,0, TRUE} // 9
, {doPatFinish, 253, 2,0, FALSE} // 10
, {doRuleError, 255, 79,0, FALSE} // 11
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
, {doNOP, 123 /* { */, 57,0, TRUE} // 15
, {doNOP, 255, 17,0, FALSE} // 16
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
, {doNOP, 255, 2,0, FALSE} // 19
, {doNOP, 63 /* ? */, 22,0, TRUE} // 20 open-paren
, {doOpenCaptureParen, 255, 2, 12, FALSE} // 21
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 22 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
, {doNOP, 60 /* < */, 34,0, TRUE} // 26
, {doNOP, 35 /* # */, 37,0, TRUE} // 27
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 28
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 29
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
, {doBadOpenParenType, 255, 79,0, FALSE} // 33
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
, {doBadOpenParenType, 255, 79,0, FALSE} // 36
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
, {doMismatchedParenErr, 253, 79,0, FALSE} // 38
, {doNOP, 255, 37,0, TRUE} // 39
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 42
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 43
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
, {doNOP, 255, 79,0, FALSE} // 47
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
, {doStar, 255, 17,0, FALSE} // 50
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 51 quant-plus
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 52
, {doPlus, 255, 17,0, FALSE} // 53
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 54 quant-opt
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
, {doOpt, 255, 17,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
, {doNotImplementedError, 255, 79,0, FALSE} // 58
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 59 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 60
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 61
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 62
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 63
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 64
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 65
, {doProperty, 112 /* p */, 12,0, FALSE} // 66
, {doProperty, 80 /* P */, 12,0, FALSE} // 67
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 68
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 69
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 70
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 71
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 72
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 73
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 74
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 75
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 76
, {doBackRef, 128, 12,0, TRUE} // 77
, {doLiteralChar, 255, 12,0, TRUE} // 78
, {doExit, 255, 79,0, TRUE} // 79 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
"finish",
"term",
0,
0,
@ -194,13 +182,6 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
"string",
0,
0,
0,
0,
0,
0,
"expr-quant",
0,

View file

@ -55,50 +55,27 @@
# start state, scan position is at the beginning of the pattern.
#
start:
default term ^finish doPatStart
default term doPatStart
#
# finish - We've scanned off the end of the pattern string.
# The "doPatFinish" action will stop the pattern scanning state machine.
#
finish:
default finish doPatFinish
#
# term. Eat through a single rule character, or a composite thing, which
# could be a parenthesized expression or a Unicode Set.
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n string doStartString
rule_char n string doStartString
'[' n expr-quant doScanUnicodeSet
'(' n open-paren ^expr-quant
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n expr-quant doScanUnicodeSet
'(' n open-paren
'.' n expr-quant doDotAny
'^' n term doCaret
'$' n term doDollar
'\' n backslash
eof finish
eof term doPatFinish
default errorDeath doRuleError
#
# string We've encountered a literal character, or an escaped character.
# Continue with any additional literal chars, building the sequence
# into a string.
#
string:
quoted n string doStringChar
rule_char n string doStringChar
# If the string ends in a quantifier, we need to split off the last character so that
# the quantifier affects only it, and not the entire string. (e.g. "ABC*")
'?' expr-quant doSplitString
'+' expr-quant doSplitString
'*' expr-quant doSplitString
'{' expr-quant doSplitString
default expr-quant doEndString
#
# expr-quant We've just finished scanning a term, now look for the optional
@ -223,12 +200,12 @@ backslash:
'A' n term doBackslashA
'B' n term doBackslashB
'b' n term doBackslashb
'd' n expr-quant doBackslashd
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'N' n expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'N' n expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'Q' n term doEnterQuoteMode
'S' n expr-quant doBackslashS
's' n expr-quant doBackslashs
@ -238,9 +215,8 @@ backslash:
'x' n expr-quant doBackslashx
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef
default n string doStartString
digit_char n expr-quant doBackRef
default n expr-quant doLiteralChar # Escaped literal char.

View file

@ -14,36 +14,38 @@
//
// Opcode types In the compiled form of the regex, these are the type, or opcodes,
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
// of the entries.
//
static const uint32_t URX_UNUSED1 = 1;
static const uint32_t URX_END = 2;
static const uint32_t URX_ONECHAR = 3; // Value field is the 21 bit unicode char to match
static const uint32_t URX_STRING = 4; // Value field is index of string start
static const uint32_t URX_STRING_LEN = 5; // Value field is string length (code units)
static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern position to push
static const uint32_t URX_NOP = 7;
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
static const uint32_t URX_STATIC_SETREF = 10; // Value field is index of set in array of sets.
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
static const uint32_t URX_DOTANY = 12;
static const uint32_t URX_JMP = 13; // Value field is destination position in
enum {
URX_UNUSED1 = 1,
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
URX_STRING = 4, // Value field is index of string start
URX_STRING_LEN = 5, // Value field is string length (code units)
URX_STATE_SAVE = 6, // Value field is pattern position to push
URX_NOP = 7,
URX_START_CAPTURE = 8, // Value field is capture group number.
URX_END_CAPTURE = 9, // Value field is capture group number
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
URX_SETREF = 11, // Value field is index of set in array of sets.
URX_DOTANY = 12,
URX_JMP = 13, // Value field is destination position in
// the pattern.
static const uint32_t URX_FAIL = 14; // Stop match operation; No match.
URX_FAIL = 14, // Stop match operation, No match.
static const uint32_t URX_BACKSLASH_A = 15;
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
static const uint32_t URX_BACKSLASH_G = 17;
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
static const uint32_t URX_BACKSLASH_X = 19;
static const uint32_t URX_BACKSLASH_Z = 20; // \z Unconditional end of line.
URX_BACKSLASH_A = 15,
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
URX_BACKSLASH_G = 17,
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
URX_BACKSLASH_X = 19,
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any mode.
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
static const uint32_t URX_CARET = 23; // Value field: 1: multi-line mode.
static const uint32_t URX_DOLLAR = 24; // Also for \Z
URX_DOTANY_ALL = 21, // ., in the . matches any mode.
URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D
URX_CARET = 23, // Value field: 1: multi-line mode.
URX_DOLLAR = 24 // Also for \Z
};
//
@ -58,13 +60,16 @@ static const uint32_t URX_DOLLAR = 24; // Also for \Z
// Access to Unicode Sets for Perl-like composite character properties
// The sets are accessed by the match engine for things like \w (word boundary)
//
static const uint32_t URX_ISWORD_SET = 1;
static const uint32_t URX_ISALNUM_SET = 2;
static const uint32_t URX_ISALPHA_SET = 3;
static const uint32_t URX_ISSPACE_SET = 4;
static const uint32_t URX_LAST_SET = 5;
enum {
// Identifiers for the predefined character sets, plus one flag bit.
// NOTE(review): the small values look like indexes into an array of sets
// (compare the URX_STATIC_SETREF opcode) - confirm against the match engine.
URX_ISWORD_SET = 1,
URX_ISALNUM_SET = 2,
URX_ISALPHA_SET = 3,
URX_ISSPACE_SET = 4,
URX_LAST_SET = 5, // NOTE(review): presumably one past the highest set ID - confirm.
URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set
// membership test.
};
static const uint32_t URX_NEG_SET = 0x800000; // Flag bit to reverse sense of set
// membership test.
#endif

View file

@ -204,6 +204,11 @@ RegexPattern *RegexPattern::compile(
if (U_FAILURE(err)) {
return NULL;
}
if (flags != 0) {
err = U_REGEX_UNIMPLEMENTED;
return NULL;
}
RegexPattern *This = new RegexPattern;
if (This == NULL) {
err = U_MEMORY_ALLOCATION_ERROR;

View file

@ -4,10 +4,35 @@
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
//
// file: regex.h
//
// ICU Regular Expressions, API for C++
//
#ifndef REGEX_H
#define REGEX_H
/**
* \file
* \brief C++ API: Regular Expressions
*
* <h2>Regular Expression API</h2>
*
* <p>The ICU API for processing regular expressions consists of two classes,
* <code>RegexPattern</code> and <code>RegexMatcher</code>.
* <code>RegexPattern</code> objects represent a pre-processed, or compiled
* regular expression. They are created from a regular expression pattern string,
* and can be used to create <code>RegexMatcher</code> objects for the pattern. </p>
*
* <p> Class <code>RegexMatcher</code> bundles together a regular expression pattern
* and a target string to which the search pattern will be applied.
* <code>RegexMatcher</code> includes API for doing plain find or search
* operations, for search and replace operations, and for obtaining detailed
* information about bounds of a match. </p>
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -25,56 +50,120 @@ class UStack;
class UnicodeSet;
//---------------------------------------------------------------------------------
//
// Flags for Regular Expression Modes.
// TODO: Move to C header once one exists.
// All flags default to off or false
// All are as defined by Java Regexps.
//
//---------------------------------------------------------------------------------
/**
* Constants for Regular Expression Match Modes.
* <p>Note that non-default match modes will not be supported until ICU 2.6</p>
* @draft ICU 2.4
*/
enum {
UREGEX_CANON_EQ = 128, // Forces normalization of pattern and strings.
UREGEX_CASE_INSENSITIVE = 2, // Enable case insensitive matching.
UREGEX_COMMENTS = 4, // Allow white space and comments within patterns
UREGEX_DOTALL = 32, // If set, "." matches line terminators.
// otherwise . matching stops at line end.
UREGEX_MULTILINE = 8, // Control behavior of "$" and "^".
// If set, recognize line terminators within string
// otherwise, match only at start and end of
// input string
UREGEX_UNICODE_CASE = 64, // If set, use full Unicode case folding for case
// insensitive matches. Otherwise, case insensitive
// matching only affects chars in the ASCII range.
// TODO: do we want to support this option at all?
UREGEX_UNIX_LINES = 1 // If set, only \n is recognized as a line terminator.
// otherwise recognize all Unicode line endings.
/** Forces normalization of pattern and strings. @draft ICU 2.4 */
UREGEX_CANON_EQ = 128,
/** Enable case insensitive matching. @draft ICU 2.4 */
UREGEX_CASE_INSENSITIVE = 2,
/** Allow white space and comments within patterns @draft ICU 2.4 */
UREGEX_COMMENTS = 4,
/** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
* @draft ICU 2.4 */
UREGEX_DOTALL = 32,
/** Control behavior of "$" and "^"
* If set, recognize line terminators within string,
* otherwise, match only at start and end of input string.
* @draft ICU 2.4 */
UREGEX_MULTILINE = 8
};
//---------------------------------------------------------------------------------
//
// class RegexPattern
//
//---------------------------------------------------------------------------------
/**
* Class <code>RegexPattern</code> represents a compiled regular expression. It includes
* factory methods for creating a RegexPattern object from the source (string) form
* of a regular expression, methods for creating RegexMatchers that allow the pattern
* to be applied to input text, and a few convenience methods for simple common
* uses of regular expressions.
*
* @draft ICU 2.4
*/
class U_I18N_API RegexPattern: public UObject {
public:
/**
* default constructor. Create a RegexPattern object that refers to no actual
* pattern. Not normally needed; RegexPattern objects are usually
* created using the factory method <code>compile()</code>.
*
* @draft ICU 2.4
*/
RegexPattern();
RegexPattern(const RegexPattern &other);
/**
* Copy Constructor. Create a new RegexPattern object that is equivalent
* to the source object.
* @draft ICU 2.4
*/
RegexPattern(const RegexPattern &source);
/**
* Destructor. Note that a RegexPattern object must persist so long as any
* RegexMatcher objects that were created from the RegexPattern are active.
* @draft ICU 2.4
*/
virtual ~RegexPattern();
/**
* Comparison operator. Two RegexPattern objects are considered equal if they
* were constructed from identical source patterns using the same match flag
* settings.
* @param that a RegexPattern object to compare with "this".
* @return TRUE if the objects are equivalent.
* @draft ICU 2.4
*/
UBool operator==(const RegexPattern& that) const;
/**
* Comparison operator. Two RegexPattern objects are considered equal if they
* were constructed from identical source patterns using the same match flag
* settings.
* @param that a RegexPattern object to compare with "this".
* @return TRUE if the objects are different.
* @draft ICU 2.4
*/
inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
RegexPattern &operator =(const RegexPattern &other);
/*
* Assignment operator. After assignment, this RegexPattern will behave identically
* to the source object.
* @draft ICU 2.4
*/
RegexPattern &operator =(const RegexPattern &source);
/*
* Create an exact copy of this RegexPattern object. Since RegexPattern is not
* intended to be subclassed, <code>clone()</code> and the copy constructor are
* equivalent operations.
*/
virtual RegexPattern *clone() const;
/**
* Compiles the given regular expression into a pattern
* <p>Compiles the given regular expression in string form into a RegexPattern
* object. The compile methods, rather than the constructors, are the usual
* way that RegexPattern objects are created.</p>
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* <p>All pattern match mode flags are set to their default values.</p>
*
* @param regex The regular expression to be compiled.
* @param pe Receives the position (line and column numbers) of any error
* within the regular expression.
* @param err A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @draft ICU 2.4
*/
static RegexPattern *compile( const UnicodeString &regex,
UParseError &pe,
@ -83,6 +172,26 @@ public:
/**
* Compiles the given regular expression into a pattern with the given flags
*/
/**
* <p>Compiles the given regular expression in string form into a RegexPattern
* object using the specified match mode flags. The compile methods,
* rather than the constructors, are the usual way that RegexPattern objects
* are created.</p>
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* @param regex The regular expression to be compiled.
* @param flags The match mode flags to be used.
* @param pe Receives the position (line and column numbers) of any error
* within the regular expression.
* @param err A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @draft ICU 2.4
*/
static RegexPattern *compile( const UnicodeString &regex,
int32_t flags,
UParseError &pe,
@ -90,19 +199,41 @@ public:
/**
* Return the flags for this pattern
* Get the match mode flags that were used when compiling this pattern.
* @return the match mode flags
* @draft ICU 2.4
*/
virtual int32_t flags() const;
/*
* Creates a matcher that will match the given input against this pattern.
* Creates a RegexMatcher that will match the given input against this pattern. The
* RegexMatcher can then be used to perform match, find or replace operations
* on the input. Note that a RegexPattern object must not be deleted while
* any RegexMatchers created from it still exist and might possibly be used again.
*
* @param input The input string to which the regular expression will be applied.
* @param err A reference to a UErrorCode to receive any errors.
* @return A RegexMatcher object for this pattern and input.
*
* @draft ICU 2.4
*/
virtual RegexMatcher *matcher(const UnicodeString &input,
UErrorCode &err) const;
/*
* Compiles the given regular expression and attempts to match the given input against it.
/**
* Test whether a string matches a regular expression. This convenience function
* both compiles the regular expression and applies it in a single operation.
* Note that if the same pattern needs to be applied repeatedly, this method will be
* less efficient than creating and reusing a RegexPattern object.
*
* @param regex The regular expression
* @param input The string data to be matched
* @param pe Receives the position of any syntax errors within the regular expression
* @param err A reference to a UErrorCode to receive any errors.
* @return True if the regular expression exactly matches the full input string.
*
* @draft ICU 2.4
*/
static UBool matches(const UnicodeString &regex,
const UnicodeString &input,
@ -112,12 +243,13 @@ public:
/*
* Returns the regular expression from which this pattern was compiled.
* @draft ICU 2.4
*/
virtual UnicodeString pattern() const;
/*
* Split a string around matches of the pattern. Somewhat like split() form Perl.
* Split a string around matches of the pattern. Somewhat like split() from Perl.
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object)
* @param dest An array of UnicodeStrings to receive the results of the split.
@ -131,6 +263,7 @@ public:
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* @return The number of fields into which the input string was split.
* @draft ICU 2.4
*/
virtual int32_t split(const UnicodeString &input,
UnicodeString dest[],
@ -147,14 +280,14 @@ public:
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
* @draft ICU 2.4
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.2
* @draft ICU 2.4
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
@ -167,12 +300,12 @@ private:
UnicodeString fPattern; // The original pattern string.
int32_t fFlags; // The flags used when compiling the pattern.
//
UVector *fCompiledPat; // The compiled, tokenized pattern.
UVector *fCompiledPat; // The compiled pattern.
UnicodeString fLiteralText; // Any literal string data from the pattern,
// after un-escaping, for use during the match.
UVector *fSets; // Any UnicodeSets referenced from the pattern.
UBool fBadState; // True if any prior error has left this
// RegexPattern unusable.
UBool fBadState; // True if some prior error has left this
// RegexPattern in an unusable state.
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
// split(), to avoid having to
@ -205,61 +338,77 @@ private:
//--------------------------------------------------------------------------------
//
// class RegexMatcher
//
//--------------------------------------------------------------------------------
class U_I18N_API RegexMatcher: public UObject {
/**
* class RegexMatcher bundles together a regular expression pattern and
* input text to which the expression can be applied. It includes methods
* for testing for matches, and for find and replace operations.
*
* @draft ICU 2.4
*/
class U_I18N_API RegexMatcher: public UObject {
public:
/* Destructor. Note that there are no public constructors; creation is
/**
* Destructor. Note that there are no public constructors; creation is
* done with RegexPattern::matcher().
*
* @draft ICU 2.4
*/
virtual ~RegexMatcher();
/*
/**
* Implements a replace operation intended to be used as part of an
* incremental find-and-replace.
*
* The input sequence, starting from the append position and ending at
* the start of the current match is appended to the destination string.
* The input string, starting from the end of the previous match and ending at
* the start of the current match, is appended to the destination string.
*
* Then the replacement string is appended to the output string,
* including handling any substitutions of captured text.
*
* The append position is set to the position of the first
* character following the match in the input string.
*
* For complete, prepackaged, non-incremental find-and-replace
* For simple, prepackaged, non-incremental find-and-replace
* operations, see replaceFirst() or replaceAll().
*
* Returns: This Matcher
* @param dest A UnicodeString to which the results of the find-and-replace are appended.
* @param replacement A UnicodeString that provides the text to be substituted for
* the input text that matched the regexp pattern. The replacement
* text may contain references to captured text from the
* input.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
* if the replacement text specifies a capture group that
* does not exist in the pattern.
*
* @return this RegexMatcher
* @draft ICU 2.4
*
* error: Illegal state - no match yet attempted, or last match failed.
* IndexOutOfBounds - capture string number from replacement string.
*/
virtual RegexMatcher &appendReplacement(UnicodeString &dest,
const UnicodeString &replacement, UErrorCode &status);
/*
* This method reads characters from the input sequence,
* starting at the append position, and appends them to the
* destination string. It is intended to be invoked after one
* or more invocations of the appendReplacement method in order
* to copy the remainder of the input sequence.
/**
* As the final step in a find-and-replace operation, append the remainder
* of the input string, starting at the position following the last match,
* to the destination string. It is intended to be invoked after one
* or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
*
* @param dest A UnicodeString to which the results of the find-and-replace are appended.
* @return the destination string.
* @draft ICU 2.4
*/
virtual UnicodeString &appendTail(UnicodeString &dest);
/*
* Returns the index of the last character matched, plus one.
* error: Illegal state - no match yet attemtped, or last match failed.
/**
* Find the ending position of the most recent match.
* @param status A reference to a UErrorCode to receive any errors. Possible
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* @draft ICU 2.4
*/
virtual int32_t end(UErrorCode &err) const;
virtual int32_t end(UErrorCode &status) const;
/*

View file

@ -367,7 +367,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_FIND("\\D+", "<0>non digits</0>");
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
}
exit(1);
#endif
@ -856,17 +856,21 @@ void RegexTest::API_Pattern() {
RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
REGEX_ASSERT(*pat1a == *pat1);
#if 0
// Compile with different flags should be not equal
RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(*pat1b != *pat1a);
REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
REGEX_ASSERT(pat1a->flags() == 0);
delete pat1b;
#endif // add test back in when we actually support flag settings.
// clone
RegexPattern *pat1c = pat1b->clone();
REGEX_ASSERT(*pat1b == *pat1c);
REGEX_ASSERT(*pat1a != *pat1c);
RegexPattern *pat1c = pat1->clone();
REGEX_ASSERT(*pat1c == *pat1);
REGEX_ASSERT(*pat1c != *pat2);
// TODO: Actually do some matches with the cloned/copied/assigned patterns.
@ -874,7 +878,6 @@ void RegexTest::API_Pattern() {
delete pat1c;
delete pat1b;
delete pat1a;
delete pat1;
delete pat2;
@ -1081,6 +1084,18 @@ void RegexTest::Extended() {
// (?# comment) doesn't muck up pattern
REGEX_FIND("Hello (?# this is a comment) world", " <0>Hello world</0>...");
// Check some implementation corner cases based on the way literal strings are compiled.
REGEX_FIND("A", "<0>A</0>");
REGEX_FIND("AB", "<0>AB</0>ABABAB");
REGEX_FIND("AB+", "<0>ABBB</0>A");
REGEX_FIND("AB+", "<0>AB</0>ABAB");
REGEX_FIND("ABC+", "<0>ABC</0>ABC");
REGEX_FIND("ABC+", "<0>ABCCCC</0>ABC");
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
REGEX_FIND("(?:ABC)DEF+", "<0>ABCDEFFF</0>D");
REGEX_FIND("AB\\.C\\eD\\u0666E", "<0>AB.C\\u001BD\\u0666E</0>F");
}
@ -1123,6 +1138,18 @@ void RegexTest::Errors() {
// {Numeric Quantifiers}
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
// Attempt to use non-default flags
{
UParseError pe;
UErrorCode status = U_ZERO_ERROR;
int32_t flags = UREGEX_CASE_INSENSITIVE | UREGEX_CANON_EQ |
UREGEX_COMMENTS | UREGEX_DOTALL |
UREGEX_MULTILINE;
RegexPattern *pat1= RegexPattern::compile(".*", UREGEX_CASE_INSENSITIVE, pe, status);
REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
delete pat1;
}
// Quantifiers are allowed only after something that can be quantified.
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);