mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-16 18:25:57 +00:00
ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10220
This commit is contained in:
parent
1560a2fb59
commit
fa16d0f578
8 changed files with 552 additions and 336 deletions
|
@ -244,13 +244,13 @@ void RegexCompile::compile(
|
|||
//
|
||||
tableEl = &gRuleParseStateTable[state];
|
||||
if (RESCAN_DEBUG) {
|
||||
printf("char, line, col = (\'%c\', %d, %d) state=%s ",
|
||||
printf( "char, line, col = (\'%c\', %d, %d) state=%s ",
|
||||
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
|
||||
}
|
||||
|
||||
for (;;) { // loop through table rows belonging to this state, looking for one
|
||||
// that matches the current input char.
|
||||
if (RESCAN_DEBUG) { printf(".");}
|
||||
if (RESCAN_DEBUG) { printf( ".");}
|
||||
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
|
||||
// Table row specified an individual character, not a set, and
|
||||
// the input character is not quoted, and
|
||||
|
@ -284,7 +284,7 @@ void RegexCompile::compile(
|
|||
// No match on this row, advance to the next row for this state,
|
||||
tableEl++;
|
||||
}
|
||||
if (RESCAN_DEBUG) { printf("\n");}
|
||||
if (RESCAN_DEBUG) { printf( "\n");}
|
||||
|
||||
//
|
||||
// We've found the row of the state table that matches the current input
|
||||
|
@ -301,7 +301,7 @@ void RegexCompile::compile(
|
|||
fStackPtr++;
|
||||
if (fStackPtr >= kStackSize) {
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
printf("RegexCompile::parse() - state stack overflow.\n");
|
||||
// printf( "RegexCompile::parse() - state stack overflow.\n");
|
||||
fStackPtr--;
|
||||
}
|
||||
fStack[fStackPtr] = tableEl->fPushState;
|
||||
|
@ -319,9 +319,12 @@ void RegexCompile::compile(
|
|||
state = fStack[fStackPtr];
|
||||
fStackPtr--;
|
||||
if (fStackPtr < 0) {
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
printf("RegexCompile::compile() - state stack underflow.\n");
|
||||
// state stack underflow
|
||||
// This will occur if the user pattern has mis-matched parentheses,
|
||||
// with extra close parens.
|
||||
//
|
||||
fStackPtr++;
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -637,94 +640,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
|
||||
case doStartString:
|
||||
// We've just scanned a single "normal" character from the pattern,
|
||||
// which is a character without special meaning that will need to be
|
||||
// matched literally. Save it away. It may be the start of a string.
|
||||
{
|
||||
fStringOpStart = fRXPat->fLiteralText.length();
|
||||
fRXPat->fLiteralText.append(fC.fChar);
|
||||
break;
|
||||
}
|
||||
|
||||
case doStringChar:
|
||||
// We've just scanned a "normal" character from the pattern, which now
|
||||
// needs to be appended to the literal match string that is
|
||||
// already being assembled.
|
||||
{
|
||||
fRXPat->fLiteralText.append(fC.fChar);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
|
||||
case doSplitString:
|
||||
// We've just peeked at a quantifier, e.g. a *, following a scanned string.
|
||||
// Separate the last character from the string, because the quantifier
|
||||
// only applies to it, not to the entire string. Emit into the compiled
|
||||
// pattern:
|
||||
// - string chars[0..n-2] (as a string, assuming more than one char)
|
||||
// - string char [n-1] (as a single character)
|
||||
{
|
||||
// Locate the positions of the last and next-to-last characters
|
||||
// in the string. Requires a bit of futzing around to account for
|
||||
// surrogate pairs, since we want 32 bit code points, not 16 bit code units.
|
||||
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
|
||||
U_ASSERT(strLength > 0);
|
||||
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
|
||||
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
|
||||
int32_t nextToLastCharIdx = lastCharIdx-1;
|
||||
if (nextToLastCharIdx > fStringOpStart) {
|
||||
nextToLastCharIdx = fRXPat->fLiteralText.getChar32Start(nextToLastCharIdx);
|
||||
}
|
||||
|
||||
if (nextToLastCharIdx > fStringOpStart) {
|
||||
// The string contains three or more code units.
|
||||
// emit the first through the next-to-last as a string.
|
||||
int32_t stringToken = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
|
||||
stringToken = URX_BUILD(URX_STRING_LEN, lastCharIdx - fStringOpStart);
|
||||
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
|
||||
}
|
||||
else if (nextToLastCharIdx == fStringOpStart) {
|
||||
// The string contains exactly two code units.
|
||||
// emit the first into the compiled pattern as a single char
|
||||
UChar32 c = fRXPat->fLiteralText.char32At(nextToLastCharIdx);
|
||||
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
|
||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||
}
|
||||
// In all cases emit the last char as a single character.
|
||||
UChar32 c = fRXPat->fLiteralText.char32At(lastCharIdx);
|
||||
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
|
||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||
}
|
||||
case doLiteralChar:
|
||||
// We've just scanned a "normal" character from the pattern,
|
||||
literalChar();
|
||||
break;
|
||||
|
||||
case doEndString:
|
||||
// We have reached the end of a literal string in the pattern.
|
||||
// Emit the string token into the compiled pattern, or if the string
|
||||
// has only one character, emit the single character token instead.
|
||||
{
|
||||
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
|
||||
U_ASSERT(strLength > 0);
|
||||
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
|
||||
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
|
||||
if (lastCharIdx == fStringOpStart) {
|
||||
// The string contains exactly one character.
|
||||
// Emit it into the compiled pattern as a single char.
|
||||
int32_t charToken = URX_BUILD(URX_ONECHAR, fRXPat->fLiteralText.char32At(fStringOpStart));
|
||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||
} else {
|
||||
// The string contains two or more chars. Emit as a string.
|
||||
// Compiled string consumes two tokens in the compiled pattern, one
|
||||
// for the index of the start-of-string, and one for the length.
|
||||
int32_t stringToken = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
|
||||
stringToken = URX_BUILD(URX_STRING_LEN, strLength);
|
||||
fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doDotAny:
|
||||
|
@ -858,6 +779,151 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
};
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// literalChar We've encountered a literal character from the pattern,
|
||||
// or an escape sequence that reduces to a character.
|
||||
// Add it to the string containing all literal chars/strings from
|
||||
// the pattern.
|
||||
// If we are in a pattern string already, add the new char to it.
|
||||
// If we aren't in a pattern string, begin one now.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::literalChar() {
|
||||
int32_t op; // An operation in the compiled pattern.
|
||||
int32_t opType;
|
||||
int32_t patternLoc; // A position in the compiled pattern.
|
||||
int32_t stringLen;
|
||||
|
||||
|
||||
// If the last thing compiled into the pattern was not a literal char,
|
||||
// force this new literal char to begin a new string, and not append to the previous.
|
||||
op = fRXPat->fCompiledPat->lastElementi();
|
||||
opType = URX_TYPE(op);
|
||||
if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) {
|
||||
fixLiterals();
|
||||
}
|
||||
|
||||
if (fStringOpStart == -1) {
|
||||
// First char of a string in the pattern.
|
||||
// Emit a OneChar op into the compiled pattern.
|
||||
op = URX_BUILD(URX_ONECHAR, fC.fChar);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
// Also add it to the string pool, in case we get a second adjacent literal
|
||||
// and want to change from ONE_CHAR to STRING
|
||||
fStringOpStart = fRXPat->fLiteralText.length();
|
||||
fRXPat->fLiteralText.append(fC.fChar);
|
||||
return;
|
||||
}
|
||||
|
||||
// We are adding onto an existing string
|
||||
fRXPat->fLiteralText.append(fC.fChar);
|
||||
|
||||
// If the most recently emitted op is a URX_ONECHAR, change it to a string op.
|
||||
op = fRXPat->fCompiledPat->lastElementi();
|
||||
opType = URX_TYPE(op);
|
||||
U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
|
||||
if (opType == URX_ONECHAR) {
|
||||
op = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
patternLoc = fRXPat->fCompiledPat->size() - 1;
|
||||
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
|
||||
op = URX_BUILD(URX_STRING_LEN, 0);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
|
||||
// The pattern contains a URX_STRING / URX_STRING_LEN. Update the
|
||||
// string length to reflect the new char we just added to the string.
|
||||
stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
|
||||
op = URX_BUILD(URX_STRING_LEN, stringLen);
|
||||
patternLoc = fRXPat->fCompiledPat->size() - 1;
|
||||
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// fixLiterals When compiling something that can follow a literal
|
||||
// string in a pattern, we need to "fix" any preceding
|
||||
// string, which will cause any subsequent literals to
|
||||
// begin a new string, rather than appending to the
|
||||
// old one.
|
||||
//
|
||||
// Optionally, split the last char of the string off into
|
||||
// a single "ONE_CHAR" operation, so that quantifiers can
|
||||
// apply to that char alone. Example: abc*
|
||||
// The * needs to apply to the 'c' only.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::fixLiterals(UBool split) {
|
||||
int32_t stringStart = fStringOpStart; // start index of the current literal string
|
||||
int32_t op; // An op from/for the compiled pattern.
|
||||
int32_t opType; // An opcode type from the compiled pattern.
|
||||
int32_t stringLastCharIdx;
|
||||
UChar32 lastChar;
|
||||
int32_t stringNextToLastCharIdx;
|
||||
UChar32 nextToLastChar;
|
||||
int32_t stringLen;
|
||||
|
||||
fStringOpStart = -1;
|
||||
if (!split) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Split: We need to ensure that the last item in the compiled pattern does
|
||||
// not refer to a literal string of more than one char. If it does,
|
||||
// separate the last char from the rest of the string.
|
||||
|
||||
// If the last operation from the compiled pattern is not a string,
|
||||
// nothing needs to be done
|
||||
op = fRXPat->fCompiledPat->lastElementi();
|
||||
opType = URX_TYPE(op);
|
||||
if (opType != URX_STRING_LEN) {
|
||||
return;
|
||||
}
|
||||
stringLen = URX_VAL(op);
|
||||
|
||||
//
|
||||
// Find the position of the last code point in the string (might be a surrogate pair)
|
||||
//
|
||||
stringLastCharIdx = fRXPat->fLiteralText.length();
|
||||
stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
|
||||
lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx);
|
||||
|
||||
// The string should always be at least two code points long, meaning that there
|
||||
// should be something before the last char position that we just found.
|
||||
U_ASSERT(stringLastCharIdx > stringStart);
|
||||
stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
|
||||
U_ASSERT(stringNextToLastCharIdx >= stringStart);
|
||||
nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
|
||||
|
||||
if (stringNextToLastCharIdx > stringStart) {
|
||||
// The length of string remaining after removing one char is two or more.
|
||||
// Leave the string in the compiled pattern, shorten it by one char,
|
||||
// and append a URX_ONECHAR op for the last char.
|
||||
stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
|
||||
op = URX_BUILD(URX_STRING_LEN, stringLen);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
|
||||
op = URX_BUILD(URX_ONECHAR, lastChar);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
} else {
|
||||
// The original string consisted of exactly two characters. Replace
|
||||
// the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
|
||||
// of URX_ONECHARs.
|
||||
op = URX_BUILD(URX_ONECHAR, nextToLastChar);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
|
||||
op = URX_BUILD(URX_ONECHAR, lastChar);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// blockTopLoc() Find or create a location in the compiled pattern
|
||||
|
@ -889,6 +955,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
|
|||
// Item just compiled is a single thing, a ".", or a single char, or a set reference.
|
||||
// No slot for STATE_SAVE was pre-reserved in the compiled code.
|
||||
// We need to make space now.
|
||||
fixLiterals(TRUE); // If last item was a string, separate the last char.
|
||||
theLoc = fRXPat->fCompiledPat->size()-1;
|
||||
if (reserveLoc) {
|
||||
int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
|
||||
|
@ -922,6 +989,10 @@ void RegexCompile::handleCloseParen() {
|
|||
return;
|
||||
}
|
||||
|
||||
// Force any literal chars that may follow the close paren to start a new string,
|
||||
// and not attach to any preceding it.
|
||||
fixLiterals(FALSE);
|
||||
|
||||
// Fixup any operations within the just-closed parenthesized group
|
||||
// that need to reference the end of the (block).
|
||||
// (The first one popped from the stack is an unused slot for
|
||||
|
@ -1211,7 +1282,7 @@ UnicodeSet *RegexCompile::scanSet() {
|
|||
if (U_FAILURE(localStatus)) {
|
||||
// TODO: Get more accurate position of the error from UnicodeSet's return info.
|
||||
// UnicodeSet appears to not be reporting correctly at this time.
|
||||
printf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
||||
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
||||
error(localStatus);
|
||||
delete uset;
|
||||
return NULL;
|
||||
|
|
|
@ -97,6 +97,8 @@ private:
|
|||
// there is space to add an opcode there.
|
||||
void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
|
||||
// a reference to a UnicodeSet.
|
||||
void literalChar(); // Compile a literal char
|
||||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
|
|
@ -29,7 +29,6 @@ enum Regex_PatternParseAction {
|
|||
doBadOpenParenType,
|
||||
doRuleError,
|
||||
doBackslashs,
|
||||
doStartString,
|
||||
doNGOpt,
|
||||
doNamedChar,
|
||||
doBackslashw,
|
||||
|
@ -55,9 +54,9 @@ enum Regex_PatternParseAction {
|
|||
doPossesiveOpt,
|
||||
doBackslashG,
|
||||
doOpt,
|
||||
doLiteralChar,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doStringChar,
|
||||
doOpenLookAhead,
|
||||
doBackRef,
|
||||
doDollar,
|
||||
|
@ -70,11 +69,9 @@ enum Regex_PatternParseAction {
|
|||
doExit,
|
||||
doPatStart,
|
||||
doBackslashb,
|
||||
doEndString,
|
||||
doBackslashd,
|
||||
doNotImplementedError,
|
||||
doBackslashd,
|
||||
doOpenLookBehindNeg,
|
||||
doSplitString,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
|
@ -94,97 +91,88 @@ struct RegexTableEl {
|
|||
|
||||
static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 3, 2, FALSE} // 1 start
|
||||
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
|
||||
, {doStartString, 254, 13,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 13,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 28, 20, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
|
||||
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
|
||||
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 67,0, TRUE} // 10
|
||||
, {doNOP, 253, 2,0, FALSE} // 11
|
||||
, {doRuleError, 255, 87,0, FALSE} // 12
|
||||
, {doStringChar, 254, 13,0, TRUE} // 13 string
|
||||
, {doStringChar, 130, 13,0, TRUE} // 14
|
||||
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
|
||||
, {doSplitString, 43 /* + */, 20,0, FALSE} // 16
|
||||
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
|
||||
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
|
||||
, {doEndString, 255, 20,0, FALSE} // 19
|
||||
, {doNOP, 42 /* * */, 56,0, TRUE} // 20 expr-quant
|
||||
, {doNOP, 43 /* + */, 59,0, TRUE} // 21
|
||||
, {doNOP, 63 /* ? */, 62,0, TRUE} // 22
|
||||
, {doNOP, 123 /* { */, 65,0, TRUE} // 23
|
||||
, {doNOP, 255, 25,0, FALSE} // 24
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 25 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 26
|
||||
, {doNOP, 255, 3,0, FALSE} // 27
|
||||
, {doNOP, 63 /* ? */, 30,0, TRUE} // 28 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 29
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 30 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 31
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 25, TRUE} // 32
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE} // 33
|
||||
, {doNOP, 60 /* < */, 42,0, TRUE} // 34
|
||||
, {doNOP, 35 /* # */, 45,0, TRUE} // 35
|
||||
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 36
|
||||
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 37
|
||||
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 38
|
||||
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 39
|
||||
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 40
|
||||
, {doBadOpenParenType, 255, 87,0, FALSE} // 41
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 25, TRUE} // 42 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE} // 43
|
||||
, {doBadOpenParenType, 255, 87,0, FALSE} // 44
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 45 paren-comment
|
||||
, {doMismatchedParenErr, 253, 87,0, FALSE} // 46
|
||||
, {doNOP, 255, 45,0, TRUE} // 47
|
||||
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 48 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 49
|
||||
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 50
|
||||
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 51
|
||||
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 52
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 53
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 54
|
||||
, {doNOP, 255, 87,0, FALSE} // 55
|
||||
, {doNGStar, 63 /* ? */, 25,0, TRUE} // 56 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 25,0, TRUE} // 57
|
||||
, {doStar, 255, 25,0, FALSE} // 58
|
||||
, {doNGPlus, 63 /* ? */, 25,0, TRUE} // 59 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 25,0, TRUE} // 60
|
||||
, {doPlus, 255, 25,0, FALSE} // 61
|
||||
, {doNGOpt, 63 /* ? */, 25,0, TRUE} // 62 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 25,0, TRUE} // 63
|
||||
, {doOpt, 255, 25,0, FALSE} // 64
|
||||
, {doNOP, 129, 65,0, TRUE} // 65 interval-open
|
||||
, {doNotImplementedError, 255, 87,0, FALSE} // 66
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 67 backslash
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 68
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 69
|
||||
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 70
|
||||
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 71
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 72
|
||||
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 73
|
||||
, {doProperty, 112 /* p */, 20,0, FALSE} // 74
|
||||
, {doProperty, 80 /* P */, 20,0, FALSE} // 75
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 76
|
||||
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 77
|
||||
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 78
|
||||
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 79
|
||||
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 80
|
||||
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 81
|
||||
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 82
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 83
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 84
|
||||
, {doBackRef, 128, 20,0, TRUE} // 85
|
||||
, {doStartString, 255, 13,0, TRUE} // 86
|
||||
, {doExit, 255, 87,0, TRUE} // 87 errorDeath
|
||||
, {doPatStart, 255, 2,0, FALSE} // 1 start
|
||||
, {doLiteralChar, 254, 12,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 130, 12,0, TRUE} // 3
|
||||
, {doScanUnicodeSet, 91 /* [ */, 12,0, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 20,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 9
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 10
|
||||
, {doRuleError, 255, 79,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
|
||||
, {doNOP, 123 /* { */, 57,0, TRUE} // 15
|
||||
, {doNOP, 255, 17,0, FALSE} // 16
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
|
||||
, {doNOP, 255, 2,0, FALSE} // 19
|
||||
, {doNOP, 63 /* ? */, 22,0, TRUE} // 20 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 12, FALSE} // 21
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 22 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE} // 23
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 17, TRUE} // 24
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE} // 25
|
||||
, {doNOP, 60 /* < */, 34,0, TRUE} // 26
|
||||
, {doNOP, 35 /* # */, 37,0, TRUE} // 27
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 28
|
||||
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 29
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
|
||||
, {doBadOpenParenType, 255, 79,0, FALSE} // 33
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
|
||||
, {doBadOpenParenType, 255, 79,0, FALSE} // 36
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
|
||||
, {doMismatchedParenErr, 253, 79,0, FALSE} // 38
|
||||
, {doNOP, 255, 37,0, TRUE} // 39
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 42
|
||||
, {doMatchMode, 120 /* x */, 40,0, TRUE} // 43
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
|
||||
, {doNOP, 255, 79,0, FALSE} // 47
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
|
||||
, {doStar, 255, 17,0, FALSE} // 50
|
||||
, {doNGPlus, 63 /* ? */, 17,0, TRUE} // 51 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 17,0, TRUE} // 52
|
||||
, {doPlus, 255, 17,0, FALSE} // 53
|
||||
, {doNGOpt, 63 /* ? */, 17,0, TRUE} // 54 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
|
||||
, {doOpt, 255, 17,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
|
||||
, {doNotImplementedError, 255, 79,0, FALSE} // 58
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 59 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 60
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 61
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 62
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 63
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 64
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 65
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 66
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 67
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 68
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 69
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 70
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 71
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 72
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 73
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 74
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 75
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 76
|
||||
, {doBackRef, 128, 12,0, TRUE} // 77
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 78
|
||||
, {doExit, 255, 79,0, TRUE} // 79 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
"finish",
|
||||
"term",
|
||||
0,
|
||||
0,
|
||||
|
@ -194,13 +182,6 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"string",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"expr-quant",
|
||||
0,
|
||||
|
|
|
@ -55,50 +55,27 @@
|
|||
# start state, scan position is at the beginning of the pattern.
|
||||
#
|
||||
start:
|
||||
default term ^finish doPatStart
|
||||
default term doPatStart
|
||||
|
||||
#
|
||||
# finish - We've scanned off the end of the pattern string.
|
||||
# The "doPatFinish" action will stop the pattern scanning state machine.
|
||||
#
|
||||
finish:
|
||||
default finish doPatFinish
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# term. Eat through a single rule character, or a composite thing, which
|
||||
# could be a parenthesized expression or a Unicode Set.
|
||||
# term.  At a position where we can accept the start of most items in a pattern.
|
||||
#
|
||||
term:
|
||||
quoted n string doStartString
|
||||
rule_char n string doStartString
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren ^expr-quant
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren
|
||||
'.' n expr-quant doDotAny
|
||||
'^' n term doCaret
|
||||
'$' n term doDollar
|
||||
'\' n backslash
|
||||
eof finish
|
||||
eof term doPatFinish
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
#
|
||||
# string We've encountered a literal character, or an escaped character.
|
||||
# Continue with any additional literal chars, building the sequence
|
||||
# into a string.
|
||||
#
|
||||
string:
|
||||
quoted n string doStringChar
|
||||
rule_char n string doStringChar
|
||||
# If the string ends in a quantifier, we need to split off the last character so that
|
||||
# the quantifier affects only it, and not the entire string. (e.g. "ABC*")
|
||||
'?' expr-quant doSplitString
|
||||
'+' expr-quant doSplitString
|
||||
'*' expr-quant doSplitString
|
||||
'{' expr-quant doSplitString
|
||||
default expr-quant doEndString
|
||||
|
||||
#
|
||||
# expr-quant We've just finished scanning a term, now look for the optional
|
||||
|
@ -223,12 +200,12 @@ backslash:
|
|||
'A' n term doBackslashA
|
||||
'B' n term doBackslashB
|
||||
'b' n term doBackslashb
|
||||
'd' n expr-quant doBackslashd
|
||||
'd' n expr-quant doBackslashd
|
||||
'D' n expr-quant doBackslashD
|
||||
'G' n term doBackslashG
|
||||
'N' n expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'N' n expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'Q' n term doEnterQuoteMode
|
||||
'S' n expr-quant doBackslashS
|
||||
's' n expr-quant doBackslashs
|
||||
|
@ -238,9 +215,8 @@ backslash:
|
|||
'x' n expr-quant doBackslashx
|
||||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
digit_char n expr-quant doBackRef
|
||||
|
||||
default n string doStartString
|
||||
digit_char n expr-quant doBackRef
|
||||
default n expr-quant doLiteralChar # Escaped literal char.
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -14,36 +14,38 @@
|
|||
|
||||
|
||||
//
|
||||
// Opcode types In the compiled form of the regex, these are the type, or opcodes,
|
||||
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
|
||||
// of the entries.
|
||||
//
|
||||
static const uint32_t URX_UNUSED1 = 1;
|
||||
static const uint32_t URX_END = 2;
|
||||
static const uint32_t URX_ONECHAR = 3; // Value field is the 21 bit unicode char to match
|
||||
static const uint32_t URX_STRING = 4; // Value field is index of string start
|
||||
static const uint32_t URX_STRING_LEN = 5; // Value field is string length (code units)
|
||||
static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern position to push
|
||||
static const uint32_t URX_NOP = 7;
|
||||
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
|
||||
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
|
||||
static const uint32_t URX_STATIC_SETREF = 10; // Value field is index of set in array of sets.
|
||||
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
|
||||
static const uint32_t URX_DOTANY = 12;
|
||||
static const uint32_t URX_JMP = 13; // Value field is destination position in
|
||||
enum {
|
||||
URX_UNUSED1 = 1,
|
||||
URX_END = 2,
|
||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||
URX_STRING = 4, // Value field is index of string start
|
||||
URX_STRING_LEN = 5, // Value field is string length (code units)
|
||||
URX_STATE_SAVE = 6, // Value field is pattern position to push
|
||||
URX_NOP = 7,
|
||||
URX_START_CAPTURE = 8, // Value field is capture group number.
|
||||
URX_END_CAPTURE = 9, // Value field is capture group number
|
||||
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
|
||||
URX_SETREF = 11, // Value field is index of set in array of sets.
|
||||
URX_DOTANY = 12,
|
||||
URX_JMP = 13, // Value field is destination position in
|
||||
// the pattern.
|
||||
static const uint32_t URX_FAIL = 14; // Stop match operation; No match.
|
||||
URX_FAIL = 14, // Stop match operation, No match.
|
||||
|
||||
static const uint32_t URX_BACKSLASH_A = 15;
|
||||
static const uint32_t URX_BACKSLASH_B = 16; // Value field: 0: \b 1: \B
|
||||
static const uint32_t URX_BACKSLASH_G = 17;
|
||||
static const uint32_t URX_BACKSLASH_W = 18; // Value field: 0: \w 1: \W
|
||||
static const uint32_t URX_BACKSLASH_X = 19;
|
||||
static const uint32_t URX_BACKSLASH_Z = 20; // \z Unconditional end of line.
|
||||
URX_BACKSLASH_A = 15,
|
||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_BACKSLASH_W = 18, // Value field: 0: \w 1: \W
|
||||
URX_BACKSLASH_X = 19,
|
||||
URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
|
||||
|
||||
static const uint32_t URX_DOTANY_ALL = 21; // ., in the . matches any mode.
|
||||
static const uint32_t URX_BACKSLASH_D = 22; // Value field: 0: \d 1: \D
|
||||
static const uint32_t URX_CARET = 23; // Value field: 1: multi-line mode.
|
||||
static const uint32_t URX_DOLLAR = 24; // Also for \Z
|
||||
URX_DOTANY_ALL = 21, // ., in the . matches any mode.
|
||||
URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D
|
||||
URX_CARET = 23, // Value field: 1: multi-line mode.
|
||||
URX_DOLLAR = 24 // Also for \Z
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
|
@ -58,13 +60,16 @@ static const uint32_t URX_DOLLAR = 24; // Also for \Z
|
|||
// Access to Unicode Sets for Perl-like composite character properties
|
||||
// The sets are accessed by the match engine for things like \w (word boundary)
|
||||
//
|
||||
static const uint32_t URX_ISWORD_SET = 1;
|
||||
static const uint32_t URX_ISALNUM_SET = 2;
|
||||
static const uint32_t URX_ISALPHA_SET = 3;
|
||||
static const uint32_t URX_ISSPACE_SET = 4;
|
||||
static const uint32_t URX_LAST_SET = 5;
|
||||
enum {
|
||||
URX_ISWORD_SET = 1,
|
||||
URX_ISALNUM_SET = 2,
|
||||
URX_ISALPHA_SET = 3,
|
||||
URX_ISSPACE_SET = 4,
|
||||
URX_LAST_SET = 5,
|
||||
|
||||
URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set
|
||||
// membership test.
|
||||
};
|
||||
|
||||
static const uint32_t URX_NEG_SET = 0x800000; // Flag bit to reverse sense of set
|
||||
// membership test.
|
||||
#endif
|
||||
|
||||
|
|
|
@ -204,6 +204,11 @@ RegexPattern *RegexPattern::compile(
|
|||
if (U_FAILURE(err)) {
|
||||
return NULL;
|
||||
}
|
||||
if (flags != 0) {
|
||||
err = U_REGEX_UNIMPLEMENTED;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
RegexPattern *This = new RegexPattern;
|
||||
if (This == NULL) {
|
||||
err = U_MEMORY_ALLOCATION_ERROR;
|
||||
|
|
|
@ -4,10 +4,35 @@
|
|||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
//
|
||||
// file: regex.h
|
||||
//
|
||||
// ICU Regular Expressions, API for C++
|
||||
//
|
||||
|
||||
#ifndef REGEX_H
|
||||
#define REGEX_H
|
||||
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Regular Expressions
|
||||
*
|
||||
* <h2>Regular Expression API</h2>
|
||||
*
|
||||
* <p>The ICU API for processing regular expressions consists of two classes,
|
||||
* <code>RegexPattern</code> and <code>RegexMatcher</code>.
|
||||
* <code>RegexPattern</code> objects represent a pre-processed, or compiled
|
||||
* regular expression. They are created from a regular expression pattern string,
|
||||
* and can be used to create <code>RegexMatcher</code> objects for the pattern. </p>
|
||||
*
|
||||
* <p> Class <code>RegexMatcher</code> bundles together a regular expression pattern
|
||||
* and a target string to which the search pattern will be applied.
|
||||
* <code>RegexMatcher</code> includes API for doing plain find or search
|
||||
* operations, for search and replace operations, and for obtaining detailed
|
||||
* information about bounds of a match. </p>
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
@ -25,56 +50,120 @@ class UStack;
|
|||
class UnicodeSet;
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// Flags for Regular Expression Modes.
|
||||
// TODO: Move to C header once one exists.
|
||||
// All flags default to off or false
|
||||
// All are as defined by Java Regexps.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
/**
|
||||
* Constants for Regular Expression Match Modes.
|
||||
* <p>Note that non-default match modes will not be supported until ICU 2.6</p>
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
enum {
|
||||
UREGEX_CANON_EQ = 128, // Forces normalization of pattern and strings.
|
||||
UREGEX_CASE_INSENSITIVE = 2, // Enable case insensitive matching.
|
||||
UREGEX_COMMENTS = 4, // Allow white space and comments within patterns
|
||||
UREGEX_DOTALL = 32, // If set, "." matches line terminators.
|
||||
// otherwise . matching stops at line end.
|
||||
UREGEX_MULTILINE = 8, // Control behavior of "$" and "^".
|
||||
// If set, recognize line terminators within string
|
||||
// otherwise, match only at start and end of
|
||||
// input string
|
||||
UREGEX_UNICODE_CASE = 64, // If set, use full Unicode case folding for case
|
||||
// insensitive matches. Otherwise, case insensitive
|
||||
// matching only affects chars in the ASCII range.
|
||||
// TODO: do we want to support this option at all?
|
||||
UREGEX_UNIX_LINES = 1 // If set, only \n is recognized as a line terminator.
|
||||
// otherwise recognize all Unicode line endings.
|
||||
/** Forces normalization of pattern and strings. @draft ICU 2.4 */
|
||||
UREGEX_CANON_EQ = 128,
|
||||
/** Enable case insensitive matching. @draft ICU 2.4 */
|
||||
UREGEX_CASE_INSENSITIVE = 2,
|
||||
/** Allow white space and comments within patterns @draft ICU 2.4 */
|
||||
UREGEX_COMMENTS = 4,
|
||||
/** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
|
||||
* @draft ICU 2.4 */
|
||||
UREGEX_DOTALL = 32,
|
||||
/** Control behavior of "$" and "^"
|
||||
* If set, recognize line terminators within string,
|
||||
* otherwise, match only at start and end of input string.
|
||||
* @draft ICU 2.4 */
|
||||
UREGEX_MULTILINE = 8
|
||||
};
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// class RegexPattern
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
/**
|
||||
* Class <code>RegexPattern</code> represents a compiled regular expression. It includes
|
||||
* factory methods for creating a RegexPattern object from the source (string) form
|
||||
* of a regular expression, methods for creating RegexMatchers that allow the pattern
|
||||
* to be applied to input text, and a few convenience methods for simple common
|
||||
* uses of regular expressions.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
class U_I18N_API RegexPattern: public UObject {
|
||||
public:
|
||||
|
||||
|
||||
/**
|
||||
* default constructor. Create a RegexPattern object that refers to no actual
|
||||
* pattern. Not normally needed; RegexPattern objects are usually
|
||||
* created using the factory method <code>compile()</code>.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
RegexPattern();
|
||||
RegexPattern(const RegexPattern &other);
|
||||
|
||||
|
||||
/**
|
||||
* Copy Constructor. Create a new RegexPattern object that is equivalent
|
||||
* to the source object.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
RegexPattern(const RegexPattern &source);
|
||||
|
||||
/**
|
||||
* Destructor. Note that a RegexPattern object must persist so long as any
|
||||
* RegexMatcher objects that were created from the RegexPattern are active.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual ~RegexPattern();
|
||||
|
||||
/**
|
||||
* Comparison operator. Two RegexPattern objects are considered equal if they
|
||||
* were constructed from identical source patterns using the same match flag
|
||||
* settings.
|
||||
* @param that a RegexPattern object to compare with "this".
|
||||
* @return TRUE if the objects are equivalent.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
UBool operator==(const RegexPattern& that) const;
|
||||
|
||||
/**
|
||||
* Comparison operator. Two RegexPattern objects are considered equal if they
|
||||
* were constructed from identical source patterns using the same match flag
|
||||
* settings.
|
||||
* @param that a RegexPattern object to compare with "this".
|
||||
* @return TRUE if the objects are different.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
|
||||
|
||||
RegexPattern &operator =(const RegexPattern &other);
|
||||
/*
|
||||
* Assignment operator. After assignment, this RegexPattern will behave identically
|
||||
* to the source object.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
RegexPattern &operator =(const RegexPattern &source);
|
||||
|
||||
/*
|
||||
* Create an exact copy of this RegexPattern object. Since RegexPattern is not
|
||||
* intended to be subclassed, <code>clone()</code> and the copy construction are
|
||||
* equivalent operations.
|
||||
*/
|
||||
virtual RegexPattern *clone() const;
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the given regular expression into a pattern
|
||||
* <p>Compiles the given regular expression in string form into a RegexPattern
|
||||
* object. The compile methods, rather than the constructors, are the usual
|
||||
* way that RegexPattern objects are created.</p>
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* <p>All pattern match mode flags are set to their default values.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param pe Receives the position (line and column numbers) of any error
|
||||
* within the regular expression.
|
||||
* @param err A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static RegexPattern *compile( const UnicodeString ®ex,
|
||||
UParseError &pe,
|
||||
|
@ -83,6 +172,26 @@ public:
|
|||
/**
|
||||
* Compiles the given regular expression into a pattern with the given flags
|
||||
*/
|
||||
/**
|
||||
* <p>Compiles the given regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. The compile methods,
|
||||
* rather than the constructors, are the usual way that RegexPattern objects
|
||||
* are created.</p>
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param flags The match mode flags to be used.
|
||||
* @param pe Receives the position (line and column numbers) of any error
|
||||
* within the regular expression.
|
||||
* @param err A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static RegexPattern *compile( const UnicodeString ®ex,
|
||||
int32_t flags,
|
||||
UParseError &pe,
|
||||
|
@ -90,19 +199,41 @@ public:
|
|||
|
||||
|
||||
/**
|
||||
* Return the flags for this pattern
|
||||
* Get the match mode flags that were used when compiling this pattern.
|
||||
* @return the match mode flags
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t flags() const;
|
||||
|
||||
/*
|
||||
* Creates a matcher that will match the given input against this pattern.
|
||||
* Creates a RegexMatcher that will match the given input against this pattern. The
|
||||
* RegexMatcher can then be used to perform match, find or replace operations
|
||||
* on the input. Note that a RegexPattern object must not be deleted while
|
||||
* any RegexMatchers created from it still exist and might possibly be used again.
|
||||
*
|
||||
* @param input The input string to which the regular expression will be applied.
|
||||
* @param err A reference to a UErrorCode to receive any errors.
|
||||
* @return A RegexMatcher object for this pattern and input.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual RegexMatcher *matcher(const UnicodeString &input,
|
||||
UErrorCode &err) const;
|
||||
|
||||
|
||||
/*
|
||||
* Compiles the given regular expression and attempts to match the given input against it.
|
||||
/**
|
||||
* Test whether a string matches a regular expression. This convenience function
|
||||
* both compiles the regular expression and applies it in a single operation.
|
||||
* Note that if the same pattern needs to be applied repeatedly, this method will be
|
||||
* less efficient than creating and reusing a RegexPattern object.
|
||||
*
|
||||
* @param regex The regular expression
|
||||
* @param input The string data to be matched
|
||||
* @param pe Receives the position of any syntax errors within the regular expression
|
||||
* @param err A reference to a UErrorCode to receive any errors.
|
||||
* @return True if the regular expression exactly matches the full input string.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static UBool matches(const UnicodeString ®ex,
|
||||
const UnicodeString &input,
|
||||
|
@ -112,12 +243,13 @@ public:
|
|||
|
||||
/*
|
||||
* Returns the regular expression from which this pattern was compiled.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString pattern() const;
|
||||
|
||||
|
||||
/*
|
||||
* Split a string around matches of the pattern. Somewhat like split() form Perl.
|
||||
* Split a string around matches of the pattern. Somewhat like split() from Perl.
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object)
|
||||
* @param dest An array of UnicodeStrings to receive the results of the split.
|
||||
|
@ -131,6 +263,7 @@ public:
|
|||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* @return The number of fields into which the input string was split.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t split(const UnicodeString &input,
|
||||
UnicodeString dest[],
|
||||
|
@ -147,14 +280,14 @@ public:
|
|||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
* @draft ICU 2.2
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*
|
||||
* @draft ICU 2.2
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
|
||||
|
@ -167,12 +300,12 @@ private:
|
|||
UnicodeString fPattern; // The original pattern string.
|
||||
int32_t fFlags; // The flags used when compiling the pattern.
|
||||
//
|
||||
UVector *fCompiledPat; // The compiled, tokenized pattern.
|
||||
UVector *fCompiledPat; // The compiled pattern.
|
||||
UnicodeString fLiteralText; // Any literal string data from the pattern,
|
||||
// after un-escaping, for use during the match.
|
||||
UVector *fSets; // Any UnicodeSets referenced from the pattern.
|
||||
UBool fBadState; // True if any prior error has left this
|
||||
// RegexPattern unusable.
|
||||
UBool fBadState; // True if some prior error has left this
|
||||
// RegexPattern in an unusable state.
|
||||
|
||||
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
|
||||
// split(), to avoid having to
|
||||
|
@ -205,61 +338,77 @@ private:
|
|||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// class RegexMatcher
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
class U_I18N_API RegexMatcher: public UObject {
|
||||
/**
|
||||
* class RegexMatcher bundles together a regular expression pattern and
|
||||
* input text to which the expression can be applied. It includes methods
|
||||
* for testing for matches, and for find and replace operations.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
class U_I18N_API RegexMatcher: public UObject {
|
||||
public:
|
||||
|
||||
/* Destructor. Note that there are no public constructors; creation is
|
||||
/**
|
||||
* Destructor. Note that there are no public constructors; creation is
|
||||
* done with RegexPattern::matcher().
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual ~RegexMatcher();
|
||||
|
||||
/*
|
||||
/**
|
||||
* Implements a replace operation intended to be used as part of an
|
||||
* incremental find-and-replace.
|
||||
*
|
||||
* The input sequence, starting from the append position and ending at
|
||||
* the start of the current match is appended to the destination string.
|
||||
* The input string, starting from the end of the previous match and ending at
|
||||
* the start of the current match, is appended to the destination string.
|
||||
*
|
||||
* Then the replacement string is appended to the output string,
|
||||
* including handling any substitutions of captured text.
|
||||
*
|
||||
* The append position is set to the position of the first
|
||||
* character following the match in the input string.
|
||||
*
|
||||
* For complete, prepackaged, non-incremental find-and-replace
|
||||
* For simple, prepackaged, non-incremental find-and-replace
|
||||
* operations, see replaceFirst() or replaceAll().
|
||||
*
|
||||
* Returns: This Matcher
|
||||
* @param dest A UnicodeString to which the results of the find-and-replace are appended.
|
||||
* @param replacement A UnicodeString that provides the text to be substituted for
|
||||
* the input text that matched the regexp pattern. The replacement
|
||||
* text may contain references to captured text from the
|
||||
* input.
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
|
||||
* if the replacement text specifies a capture group that
|
||||
* does not exist in the pattern.
|
||||
*
|
||||
* @return this RegexMatcher
|
||||
* @draft ICU 2.4
|
||||
*
|
||||
* error: Illegal state - no match yet attempted, or last match failed.
|
||||
* IndexOutOfBounds - capture string number from replacement string.
|
||||
*/
|
||||
virtual RegexMatcher &appendReplacement(UnicodeString &dest,
|
||||
const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/*
|
||||
* This method reads characters from the input sequence,
|
||||
* starting at the append position, and appends them to the
|
||||
* destination string. It is intended to be invoked after one
|
||||
* or more invocations of the appendReplacement method in order
|
||||
* to copy the remainder of the input sequence.
|
||||
/**
|
||||
* As the final step in a find-and-replace operation, append the remainder
|
||||
* of the input string, starting at the position following the last match,
|
||||
* to the destination string. It is intended to be invoked after one
|
||||
* or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
|
||||
*
|
||||
* @param dest A UnicodeString to which the results of the find-and-replace are appended.
|
||||
* @return the destination string.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeString &appendTail(UnicodeString &dest);
|
||||
|
||||
|
||||
/*
|
||||
* Returns the index of the last character matched, plus one.
|
||||
* error: Illegal state - no match yet attempted, or last match failed.
|
||||
/**
|
||||
* Find the ending position of the most recent match.
|
||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||
* attempted or the last match failed.
|
||||
* @return the index of the last character matched, plus one.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t end(UErrorCode &err) const;
|
||||
virtual int32_t end(UErrorCode &status) const;
|
||||
|
||||
|
||||
/*
|
||||
|
|
|
@ -367,7 +367,7 @@ void RegexTest::Basic() {
|
|||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_FIND("\\D+", "<0>non digits</0>");
|
||||
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
|
||||
}
|
||||
exit(1);
|
||||
#endif
|
||||
|
@ -856,17 +856,21 @@ void RegexTest::API_Pattern() {
|
|||
RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
|
||||
REGEX_ASSERT(*pat1a == *pat1);
|
||||
|
||||
#if 0
|
||||
// Compile with different flags should be not equal
|
||||
RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
REGEX_ASSERT(*pat1b != *pat1a);
|
||||
REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
|
||||
REGEX_ASSERT(pat1a->flags() == 0);
|
||||
delete pat1b;
|
||||
#endif // add test back in when we actually support flag settings.
|
||||
|
||||
// clone
|
||||
RegexPattern *pat1c = pat1b->clone();
|
||||
REGEX_ASSERT(*pat1b == *pat1c);
|
||||
REGEX_ASSERT(*pat1a != *pat1c);
|
||||
RegexPattern *pat1c = pat1->clone();
|
||||
REGEX_ASSERT(*pat1c == *pat1);
|
||||
REGEX_ASSERT(*pat1c != *pat2);
|
||||
|
||||
|
||||
// TODO: Actually do some matches with the cloned/copied/assigned patterns.
|
||||
|
@ -874,7 +878,6 @@ void RegexTest::API_Pattern() {
|
|||
|
||||
|
||||
delete pat1c;
|
||||
delete pat1b;
|
||||
delete pat1a;
|
||||
delete pat1;
|
||||
delete pat2;
|
||||
|
@ -1081,6 +1084,18 @@ void RegexTest::Extended() {
|
|||
// (?# comment) doesn't muck up pattern
|
||||
REGEX_FIND("Hello (?# this is a comment) world", " <0>Hello world</0>...");
|
||||
|
||||
// Check some implementation corner cases based on the way literal strings are compiled.
|
||||
REGEX_FIND("A", "<0>A</0>");
|
||||
REGEX_FIND("AB", "<0>AB</0>ABABAB");
|
||||
REGEX_FIND("AB+", "<0>ABBB</0>A");
|
||||
REGEX_FIND("AB+", "<0>AB</0>ABAB");
|
||||
REGEX_FIND("ABC+", "<0>ABC</0>ABC");
|
||||
REGEX_FIND("ABC+", "<0>ABCCCC</0>ABC");
|
||||
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
|
||||
REGEX_FIND("(?:ABC)DEF+", "<0>ABCDEFFF</0>D");
|
||||
REGEX_FIND("AB\\.C\\eD\\u0666E", "<0>AB.C\\u001BD\\u0666E</0>F");
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1123,6 +1138,18 @@ void RegexTest::Errors() {
|
|||
// {Numeric Quantifiers}
|
||||
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Attempt to use non-default flags
|
||||
{
|
||||
UParseError pe;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t flags = UREGEX_CASE_INSENSITIVE | UREGEX_CANON_EQ |
|
||||
UREGEX_COMMENTS | UREGEX_DOTALL |
|
||||
UREGEX_MULTILINE;
|
||||
RegexPattern *pat1= RegexPattern::compile(".*", UREGEX_CASE_INSENSITIVE, pe, status);
|
||||
REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
|
||||
delete pat1;
|
||||
}
|
||||
|
||||
|
||||
// Quantifiers are allowed only after something that can be quantified.
|
||||
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
|
|
Loading…
Add table
Reference in a new issue