ICU-6149 Merge regexp resource limit functions into the trunk

X-SVN-Rev: 23467
This commit is contained in:
Andy Heninger 2008-02-20 03:08:44 +00:00
parent 56c21905a4
commit 5e550c04ac
11 changed files with 1048 additions and 157 deletions

View file

@ -729,6 +729,9 @@ typedef enum UErrorCode {
U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. */
U_REGEX_MISSING_CLOSE_BRACKET, /**< Missing closing bracket on a bracket expression. */
U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */
U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */
U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */
U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
/*

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1997-2007, International Business Machines
* Copyright (C) 1997-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -156,7 +156,10 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_SET_CONTAINS_STRING",
"U_REGEX_OCTAL_TOO_BIG",
"U_REGEX_MISSING_CLOSE_BRACKET",
"U_REGEX_INVALID_RANGE"
"U_REGEX_INVALID_RANGE",
"U_REGEX_STACK_OVERFLOW",
"U_REGEX_TIME_OUT",
"U_REGEX_STOPPED_BY_CALLER"
};
static const char * const

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1999-2003, International Business Machines Corporation and *
* Copyright (C) 1999-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
* Date Name Description
@ -26,6 +26,7 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UVector32)
UVector32::UVector32(UErrorCode &status) :
count(0),
capacity(0),
maxCapacity(0),
elements(NULL)
{
_init(DEFUALT_CAPACITY, status);
@ -34,6 +35,7 @@ UVector32::UVector32(UErrorCode &status) :
UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) :
count(0),
capacity(0),
maxCapacity(0),
elements(0)
{
_init(initialCapacity, status);
@ -46,6 +48,9 @@ void UVector32::_init(int32_t initialCapacity, UErrorCode &status) {
if (initialCapacity < 1) {
initialCapacity = DEFUALT_CAPACITY;
}
if (maxCapacity>0 && maxCapacity<initialCapacity) {
initialCapacity = maxCapacity;
}
elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity);
if (elements == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
@ -187,23 +192,57 @@ int32_t UVector32::indexOf(int32_t key, int32_t startIndex) const {
UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) {
if (capacity < minimumCapacity) {
int32_t newCap = capacity * 2;
if (newCap < minimumCapacity) {
newCap = minimumCapacity;
}
int32_t* newElems = (int32_t *)uprv_realloc(elements, sizeof(int32_t)*newCap);
if (newElems == NULL) {
// We keep the original contents on the memory failure on realloc.
status = U_MEMORY_ALLOCATION_ERROR;
return FALSE;
}
elements = newElems;
capacity = newCap;
if (capacity >= minimumCapacity) {
return TRUE;
}
if (maxCapacity>0 && minimumCapacity>maxCapacity) {
status = U_BUFFER_OVERFLOW_ERROR;
return FALSE;
}
int32_t newCap = capacity * 2;
if (newCap < minimumCapacity) {
newCap = minimumCapacity;
}
if (maxCapacity > 0 && newCap > maxCapacity) {
newCap = maxCapacity;
}
int32_t* newElems = (int32_t *)uprv_realloc(elements, sizeof(int32_t)*newCap);
if (newElems == NULL) {
// We keep the original contents on the memory failure on realloc.
status = U_MEMORY_ALLOCATION_ERROR;
return FALSE;
}
elements = newElems;
capacity = newCap;
return TRUE;
}
//
//  setMaxCapacity    Set the upper bound on the capacity of this vector.
//
//      limit    The maximum number of 32 bit elements the vector may hold.
//               Zero means that no limit is imposed.  Negative values are
//               clamped to zero.
//
//  When the storage currently allocated is larger than a non-zero limit the
//  buffer is shrunk to the limit, and count is cut back to match if needed.
//
void UVector32::setMaxCapacity(int32_t limit) {
    U_ASSERT(limit >= 0);
    maxCapacity = (limit < 0) ? 0 : limit;

    const UBool unlimited = (maxCapacity == 0);
    if (unlimited || capacity <= maxCapacity) {
        // Nothing to trim; the existing storage already satisfies the limit.
        return;
    }

    // The existing buffer is bigger than the new limit permits.  Try to give
    // the excess back by reallocating down to exactly maxCapacity elements.
    int32_t *shrunk = (int32_t *)uprv_realloc(elements, sizeof(int32_t)*maxCapacity);
    if (shrunk != NULL) {
        elements = shrunk;
        capacity = maxCapacity;
        if (count > capacity) {
            count = capacity;
        }
    }
    // If the shrinking realloc failed the old buffer is simply kept;
    // this is not reported as an error.
}
/**
* Change the size of this vector as follows: If newSize is smaller,
* then truncate the array, possibly deleting held elements for i >=

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2006, International Business Machines
* Copyright (C) 1999-2008, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -61,6 +61,8 @@ private:
int32_t count;
int32_t capacity;
int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow.
int32_t* elements;
@ -161,6 +163,14 @@ public:
*/
int32_t *getBuffer() const;
/**
* Set the maximum allowed buffer capacity for this vector/stack.
* Default with no limit set is unlimited, go until malloc() fails.
* A Limit of zero means unlimited capacity.
* Units are vector elements (32 bits each), not bytes.
*/
void setMaxCapacity(int32_t limit);
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
@ -221,7 +231,9 @@ inline void UVector32::addElement(int32_t elem, UErrorCode &status) {
}
inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) {
ensureCapacity(count+size, status);
if (ensureCapacity(count+size, status) == FALSE) {
return NULL;
}
int32_t *rp = elements+count;
count += size;
return rp;

View file

@ -1,6 +1,6 @@
/*
**************************************************************************
* Copyright (C) 2002-2007 International Business Machines Corporation *
* Copyright (C) 2002-2008 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
@ -30,90 +30,69 @@
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
// NOTE(review): setStackLimit() receives this value and divides it by
// sizeof(int32_t), i.e. it treats the value as bytes - confirm the units.
// This value puts ICU's limits higher than most other regexp implementations,
// which use recursion rather than the heap, and take more storage per
// backtrack point.
//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
// Time limit counter constant.
// Time limits for expression evaluation are in terms of quanta of work by
// the engine, each of which is 10,000 state saves.
// This constant sets the number of state saves per tick (quantum).
static const int32_t TIMER_INITIAL_VALUE = 10000;
//-----------------------------------------------------------------------------
//
// Constructor and Destructor
//
//-----------------------------------------------------------------------------
RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fPattern = pat;
fPatternOwned = NULL;
fInput = NULL;
fTraceDebug = FALSE;
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(fDeferredStatus);
fData = fSmallData;
fWordBreakItr = NULL;
fTransparentBounds = FALSE;
fAnchoringBounds = TRUE;
fDeferredStatus = U_ZERO_ERROR;
init(fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (pat==NULL) {
fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
}
if (fStack == NULL || fData == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
}
reset(RegexStaticSets::gStaticSets->fEmptyString);
fPattern = pat;
init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus);
}
RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
uint32_t flags, UErrorCode &status) {
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
fTraceDebug = FALSE;
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(status);
fData = fSmallData;
fWordBreakItr = NULL;
fTransparentBounds = FALSE;
fAnchoringBounds = TRUE;
init(status);
if (U_FAILURE(status)) {
return;
}
if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
}
if (fStack == NULL || fData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
reset(input);
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
init2(input, status);
}
RegexMatcher::RegexMatcher(const UnicodeString &regexp,
uint32_t flags, UErrorCode &status) {
UParseError pe;
fTraceDebug = FALSE;
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(status);
fData = fSmallData;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
fWordBreakItr = NULL;
fTransparentBounds = FALSE;
fAnchoringBounds = TRUE;
init(status);
if (U_FAILURE(status)) {
return;
}
if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
}
if (fStack == NULL || fData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
reset(RegexStaticSets::gStaticSets->fEmptyString);
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
init2(RegexStaticSets::gStaticSets->fEmptyString, status);
}
RegexMatcher::~RegexMatcher() {
delete fStack;
if (fData != fSmallData) {
@ -130,6 +109,79 @@ RegexMatcher::~RegexMatcher() {
#endif
}
//
//   init()   common initialization for use by all constructors.
//            Initialize all fields, get the object into a consistent state.
//            This must be done even when the initial status shows an error,
//            so that the object is initialized sufficiently well for the destructor
//            to run safely.
//
//            status   In: the error state on entry; it is copied into fDeferredStatus.
//                     Out: set to U_MEMORY_ALLOCATION_ERROR if the backtrack stack
//                     object could not be allocated; may also be set to a failure
//                     by the UVector32 constructor.
//
void RegexMatcher::init(UErrorCode &status) {
    fPattern           = NULL;
    fPatternOwned      = NULL;
    fInput             = NULL;
    fFrameSize         = 0;
    fRegionStart       = 0;
    fRegionLimit       = 0;
    fAnchorStart       = 0;
    fAnchorLimit       = 0;
    fLookStart         = 0;
    fLookLimit         = 0;
    fActiveStart       = 0;
    fActiveLimit       = 0;
    fTransparentBounds = FALSE;
    fAnchoringBounds   = TRUE;
    fMatch             = FALSE;
    fMatchStart        = 0;
    fMatchEnd          = 0;
    fLastMatchEnd      = -1;
    fAppendPosition    = 0;
    fHitEnd            = FALSE;
    fRequireEnd        = FALSE;
    fStack             = NULL;
    fFrame             = NULL;
    fTimeLimit         = 0;
    fTime              = 0;
    fTickCounter       = 0;
    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    fCallbackFn        = NULL;
    fCallbackContext   = NULL;
    fTraceDebug        = FALSE;
    fDeferredStatus    = status;
    fData              = fSmallData;
    fWordBreakItr      = NULL;

    fStack             = new UVector32(status);
    if (U_FAILURE(status)) {
        fDeferredStatus = status;
    } else if (fStack == NULL) {
        // The allocation itself failed, so no constructor ran to set status.
        // (Presumably operator new for UVector32 returns NULL instead of
        // throwing, per ICU's UMemory convention - confirm.)  Record the
        // failure here; otherwise later code such as setStackLimit() would
        // dereference the NULL fStack while status still shows success.
        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    }
}
//
// init2() Common initialization for use by RegexMatcher constructors, part 2.
// This handles the common setup to be done after the Pattern is available.
//
void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) {
if (U_FAILURE(status)) {
fDeferredStatus = status;
return;
}
if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
if (fData == NULL) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
reset(input);
setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
if (U_FAILURE(status)) {
fDeferredStatus = status;
return;
}
}
static const UChar BACKSLASH = 0x5c;
@ -856,6 +908,8 @@ void RegexMatcher::resetPreserveRegion() {
fMatch = FALSE;
fHitEnd = FALSE;
fRequireEnd = FALSE;
fTime = 0;
fTickCounter = TIMER_INITIAL_VALUE;
resetStack();
}
@ -1067,6 +1121,118 @@ RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
return *this;
}
//--------------------------------------------------------------------------------
//
//   setTimeLimit     Store a new time limit for match operations.
//
//                    limit    the new limit; must not be negative.
//                    status   set to the matcher's deferred error if there is
//                             one, or to U_ILLEGAL_ARGUMENT_ERROR for a
//                             negative limit.  Nothing is done if status
//                             already indicates a failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
    } else if (limit < 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        fTimeLimit = limit;
    }
}
//--------------------------------------------------------------------------------
//
//   getTimeLimit     Return the time limit currently stored in this matcher.
//                    A value of zero means the time check in IncrementTime()
//                    is not applied.
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getTimeLimit() const {
    return this->fTimeLimit;
}
//--------------------------------------------------------------------------------
//
//   setStackLimit    Set the limit on the size of the back track stack.
//
//                    limit    the new limit, in bytes.  Zero means unlimited;
//                             negative values are rejected.
//                    status   set to the matcher's deferred error if there is
//                             one, or to U_ILLEGAL_ARGUMENT_ERROR for a
//                             negative limit.  Nothing is done if status
//                             already indicates a failure on entry.
//
//                    The matcher is reset as a side effect.
//
//--------------------------------------------------------------------------------
void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return;
    }
    if (limit < 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // A current match keeps its results in the final stack frame (fFrame).
    // Shrinking the stack could discard that frame, so reset the matcher
    // before touching the stack size.
    reset();

    // Capacity to hand to the stack, in 32 bit ints.  Zero requests
    // unlimited stack expansion.
    int32_t maxStackInts = 0;
    if (limit != 0) {
        // Convert from bytes to ints, then make sure there is room for
        // at least one stack frame of the pattern.
        maxStackInts = (int32_t)(limit / sizeof(int32_t));
        if (maxStackInts < fPattern->fFrameSize) {
            maxStackInts = fPattern->fFrameSize;
        }
    }
    fStack->setMaxCapacity(maxStackInts);
    fStackLimit = limit;
}
//--------------------------------------------------------------------------------
//
//   getStackLimit    Return the stack limit stored in this matcher.  This is
//                    the value as supplied to setStackLimit(), before its
//                    conversion from bytes to 32 bit ints.
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getStackLimit() const {
    return this->fStackLimit;
}
//--------------------------------------------------------------------------------
//
//   setMatchCallback    Install the callback function, and the context pointer
//                       passed to it, that IncrementTime() invokes during
//                       matching.  Nothing is changed if status indicates
//                       a failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::setMatchCallback(URegexMatchCallback callback,
                                    const void *context,
                                    UErrorCode &status) {
    if (!U_FAILURE(status)) {
        fCallbackFn      = callback;
        fCallbackContext = context;
    }
}
//--------------------------------------------------------------------------------
//
//   getMatchCallback    Fetch the callback function and context pointer that
//                       are currently installed in this matcher.  The output
//                       parameters are left untouched if status indicates
//                       a failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::getMatchCallback(URegexMatchCallback &callback,
                                    const void *&context,
                                    UErrorCode &status) {
    if (!U_FAILURE(status)) {
        callback = fCallbackFn;
        context  = fCallbackContext;
    }
}
//================================================================================
//
@ -1187,6 +1353,31 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) {
return returnVal;
}
//--------------------------------------------------------------------------------
//
//   IncrementTime     Called once for every TIMER_INITIAL_VALUE state saves.
//                     Restarts the tick counter, advances the "time" counter
//                     by one, and invokes the user callback function when one
//                     is installed.
//
//                     A match that must be abandoned is signalled only by
//                     setting status: U_REGEX_STOPPED_BY_CALLER when the
//                     callback returns FALSE, otherwise U_REGEX_TIME_OUT when
//                     a non-zero time limit has been reached.  The engine
//                     picks the error up and stops in its outer loop.
//
//--------------------------------------------------------------------------------
void RegexMatcher::IncrementTime(UErrorCode &status) {
    fTickCounter = TIMER_INITIAL_VALUE;
    ++fTime;

    if (fCallbackFn != NULL && (*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
        // The caller asked for the match to stop; this takes precedence
        // over the time limit check.
        status = U_REGEX_STOPPED_BY_CALLER;
    } else if (fTimeLimit > 0 && fTime >= fTimeLimit) {
        status = U_REGEX_TIME_OUT;
    }
}
//--------------------------------------------------------------------------------
//
// StateSave
@ -1196,13 +1387,33 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) {
// the newly created stack frame
//
// Note that reserveBlock() may grow the stack, resulting in the
// whole thing being relocated in memory.
// whole thing being relocated in memory.
//
// Parameters:
// fp The top frame pointer when called. At return, a new
// frame will be present
// savePatIdx An index into the compiled pattern. Goes into the original
// (not new) frame. If execution ever back-tracks out of the
// new frame, this will be where we continue from in the pattern.
// Return
// The new frame pointer.
//
//--------------------------------------------------------------------------------
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) {
// push storage for a new frame.
int32_t *newFP = fStack->reserveBlock(frameSize, status);
fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
int32_t *newFP = fStack->reserveBlock(fFrameSize, status);
if (newFP == NULL) {
// Failure on attempted stack expansion.
// Stack function set some other error code, change it to a more
// specific one for regular expressions.
status = U_REGEX_STACK_OVERFLOW;
// We need to return a writable stack frame, so just return the
// previous frame. The match operation will stop quickly
// because of the error status, after which the frame will never
// be looked at again.
return fp;
}
fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
// New stack frame = copy of old top frame.
int32_t *source = (int32_t *)fp;
@ -1214,6 +1425,10 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatId
}
}
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
fp->fPatIdx = savePatIdx;
return (REStackFrame *)newFP;
}
@ -1262,7 +1477,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
}
// Cache frequently referenced items from the compiled pattern
// in local variables.
//
int32_t *pat = fPattern->fCompiledPat->getBuffer();
@ -1271,8 +1485,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
const UChar *inputBuf = fInput->getBuffer();
fFrameSize = fPattern->fFrameSize;
REStackFrame *fp = resetStack();
int32_t frameSize = fPattern->fFrameSize;
fp->fPatIdx = 0;
fp->fInputIdx = startIdx;
@ -1316,7 +1530,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -1330,7 +1544,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -1352,7 +1566,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
if (fp->fInputIdx + stringLen > fActiveLimit) {
// No match. String is longer than the remaining input text.
fHitEnd = TRUE; // TODO: See ticket 6074
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -1370,7 +1584,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
}
} else {
// Match failed.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
}
@ -1380,7 +1594,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
case URX_STATE_SAVE:
fp = StateSave(fp, opValue, frameSize, status);
fp = StateSave(fp, opValue, status);
break;
@ -1389,7 +1603,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
// when we reach the end of the pattern.
if (toEnd && fp->fInputIdx != fActiveLimit) {
// The pattern matched, but not to the end of input. Try some more.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
isMatch = TRUE;
@ -1401,26 +1615,26 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
// opValue+2 - the start of a capture group whose end
// has not yet been reached (and might not ever be).
case URX_START_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
fp->fExtra[opValue+2] = fp->fInputIdx;
break;
case URX_END_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input
if (fp->fInputIdx < fAnchorLimit-2) {
// We are no where near the end of input. Fail.
// This is the common case. Keep it first.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
if (fp->fInputIdx >= fAnchorLimit) {
@ -1451,7 +1665,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
break; // At CR/LF at end of input. Success
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -1475,7 +1689,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
}
// Not at end of input. Back-track out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -1499,7 +1713,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
}
}
// not at a new line. Fail.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -1515,7 +1729,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
// If we are not positioned just before a new-line, the test fails; backtrack out.
// It makes no difference where the new-line is within the input.
if (inputBuf[fp->fInputIdx] != 0x0a) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1523,7 +1737,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
case URX_CARET: // ^, test for start of line
if (fp->fInputIdx != fAnchorStart) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -1544,7 +1758,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
break;
}
// Not at the start of a line. Fail.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -1561,7 +1775,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
UChar c = inputBuf[fp->fInputIdx - 1];
if (c != 0x0a) {
// Not at the start of a line. Back-track out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1571,7 +1785,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
UBool success = isWordBoundary(fp->fInputIdx);
success ^= (opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1582,7 +1796,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
UBool success = isUWordBoundary(fp->fInputIdx);
success ^= (opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1592,7 +1806,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -1603,7 +1817,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
if (success) {
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
} else {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1611,7 +1825,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
case URX_BACKSLASH_G: // Test for position at end of previous match
if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -1625,7 +1839,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
// Fail if at end of input
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -1703,7 +1917,7 @@ GC_Done:
case URX_BACKSLASH_Z: // Test for end of Input
if (fp->fInputIdx < fAnchorLimit) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
} else {
fHitEnd = TRUE;
fRequireEnd = TRUE;
@ -1721,7 +1935,7 @@ GC_Done:
// 1: success if input char is not in set.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -1742,7 +1956,7 @@ GC_Done:
}
}
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1754,7 +1968,7 @@ GC_Done:
// the predefined sets (Word Characters, for example)
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -1773,7 +1987,7 @@ GC_Done:
}
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -1781,7 +1995,7 @@ GC_Done:
case URX_SETREF:
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Pick up one char and test it for set membership.
@ -1801,7 +2015,7 @@ GC_Done:
}
}
// the character wasn't in the set. Back track out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -1811,7 +2025,7 @@ GC_Done:
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
@ -1820,7 +2034,7 @@ GC_Done:
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
}
@ -1833,7 +2047,7 @@ GC_Done:
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, except if we are
@ -1858,7 +2072,7 @@ GC_Done:
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
@ -1866,7 +2080,7 @@ GC_Done:
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c == 0x0a) {
// End of line in normal mode. '.' does not match the \n
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1882,8 +2096,8 @@ GC_Done:
case URX_JMP_SAV:
U_ASSERT(opValue < fPattern->fCompiledPat->size());
fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
break;
case URX_JMP_SAV_X:
@ -1896,12 +2110,12 @@ GC_Done:
int32_t stoOp = pat[opValue-1];
U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
int32_t frameLoc = URX_VAL(stoOp);
U_ASSERT(frameLoc >= 0 && frameLoc < frameSize);
U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
int32_t prevInputIdx = fp->fExtra[frameLoc];
U_ASSERT(prevInputIdx <= fp->fInputIdx);
if (prevInputIdx < fp->fInputIdx) {
// The match did make progress. Repeat the loop.
fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue;
fp->fExtra[frameLoc] = fp->fInputIdx;
}
@ -1912,7 +2126,7 @@ GC_Done:
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
@ -1927,10 +2141,10 @@ GC_Done:
U_ASSERT(loopLoc>fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, frameSize, status);
fp = StateSave(fp, loopLoc+1, status);
}
if (maxCount == 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
@ -1953,7 +2167,7 @@ GC_Done:
break;
}
if (*pCounter >= minCount) {
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
fp = StateSave(fp, fp->fPatIdx, status);
}
fp->fPatIdx = opValue + 4; // Loop back.
}
@ -1962,7 +2176,7 @@ GC_Done:
case URX_CTR_INIT_NG:
{
// Initialize a non-greedy loop
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
@ -1978,7 +2192,7 @@ GC_Done:
if (minCount == 0) {
if (maxCount != 0) {
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
fp = StateSave(fp, fp->fPatIdx, status);
}
fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
}
@ -2017,7 +2231,7 @@ GC_Done:
// Fall into the following pattern, but first do
// a state save to the top of the loop, so that a failure
// in the following pattern will try another iteration of the loop.
fp = StateSave(fp, opValue + 4, frameSize, status);
fp = StateSave(fp, opValue + 4, status);
}
}
break;
@ -2032,12 +2246,12 @@ GC_Done:
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
int32_t newStackSize = fData[opValue];
U_ASSERT(newStackSize <= fStack->size());
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
if (newFP == (int32_t *)fp) {
break;
}
int32_t i;
for (i=0; i<frameSize; i++) {
for (i=0; i<fFrameSize; i++) {
newFP[i] = ((int32_t *)fp)[i];
}
fp = (REStackFrame *)newFP;
@ -2048,14 +2262,14 @@ GC_Done:
case URX_BACKREF:
case URX_BACKREF_I:
{
U_ASSERT(opValue < frameSize);
U_ASSERT(opValue < fFrameSize);
int32_t groupStartIdx = fp->fExtra[opValue];
int32_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
int32_t len = groupEndIdx-groupStartIdx;
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
}
if (len == 0) {
@ -2085,14 +2299,14 @@ GC_Done:
if (haveMatch) {
fp->fInputIdx += len; // Match. Advance current input position.
} else {
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
}
}
break;
case URX_STO_INP_LOC:
{
U_ASSERT(opValue >= 0 && opValue < frameSize);
U_ASSERT(opValue >= 0 && opValue < fFrameSize);
fp->fExtra[opValue] = fp->fInputIdx;
}
break;
@ -2102,13 +2316,13 @@ GC_Done:
int32_t instrOperandLoc = fp->fPatIdx;
fp->fPatIdx += 1;
int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
int32_t savedInputIdx = fp->fExtra[dataLoc];
U_ASSERT(savedInputIdx <= fp->fInputIdx);
if (savedInputIdx < fp->fInputIdx) {
fp->fPatIdx = opValue; // JMP
} else {
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop.
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
}
}
break;
@ -2137,9 +2351,9 @@ GC_Done:
// Copy the current top frame back to the new (cut back) top frame.
// This makes the capture groups from within the look-ahead
// expression available.
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
int32_t i;
for (i=0; i<frameSize; i++) {
for (i=0; i<fFrameSize; i++) {
newFP[i] = ((int32_t *)fp)[i];
}
fp = (REStackFrame *)newFP;
@ -2164,7 +2378,7 @@ GC_Done:
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING_I:
@ -2195,7 +2409,7 @@ GC_Done:
fHitEnd = TRUE; // See ticket 6074
}
// No match. Back up matching to a saved state
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -2249,7 +2463,7 @@ GC_Done:
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
int32_t restoreInputLen = fData[opValue+3];
U_ASSERT(restoreInputLen >= fActiveLimit);
U_ASSERT(restoreInputLen <= fInput->length());
@ -2259,7 +2473,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, frameSize, status);
fp = StateSave(fp, fp->fPatIdx-3, status);
fp->fInputIdx = *lbStartIdx;
}
break;
@ -2274,7 +2488,7 @@ GC_Done:
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or fail
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -2333,7 +2547,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, frameSize, status);
fp = StateSave(fp, fp->fPatIdx-4, status);
fp->fInputIdx = *lbStartIdx;
}
break;
@ -2348,7 +2562,7 @@ GC_Done:
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or succeed
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
@ -2372,7 +2586,7 @@ GC_Done:
// FAIL, which will take control back to someplace
// prior to entering the look-behind test.
fp = (REStackFrame *)fStack->popFrame(frameSize);
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
@ -2423,14 +2637,14 @@ GC_Done:
int32_t loopcOp = pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
@ -2486,14 +2700,14 @@ GC_Done:
int32_t loopcOp = pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
@ -2501,7 +2715,7 @@ GC_Done:
case URX_LOOP_C:
{
U_ASSERT(opValue>=0 && opValue<frameSize);
U_ASSERT(opValue>=0 && opValue<fFrameSize);
int32_t terminalIdx = fp->fExtra[opValue];
U_ASSERT(terminalIdx <= fp->fInputIdx);
if (terminalIdx == fp->fInputIdx) {
@ -2528,7 +2742,7 @@ GC_Done:
}
fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
fp = StateSave(fp, fp->fPatIdx-1, status);
}
break;
@ -2541,6 +2755,7 @@ GC_Done:
}
if (U_FAILURE(status)) {
isMatch = FALSE;
break;
}
}

View file

@ -1003,6 +1003,102 @@ public:
int32_t destCapacity,
UErrorCode &status);
/**
* Set a processing time limit for match operations with this Matcher.
*
* Some patterns, when matching certain strings, can run in exponential time.
* For practical purposes, the match operation may appear to be in an
* infinite loop.
* When a limit is set a match operation will fail with an error if the
* limit is exceeded.
* <p>
* The units of the limit are steps of the match engine.
* Correspondence with actual processor time will depend on the speed
* of the processor and the details of the specific pattern, but will
* typically be on the order of milliseconds.
* <p>
* By default, the matching time is not limited.
* <p>
*
* @param limit The limit value, or 0 for no limit.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
virtual void setTimeLimit(int32_t limit, UErrorCode &status);
/**
* Get the time limit, if any, for match operations made with this Matcher.
*
* @return the maximum allowed time for a match, in units of processing steps.
* @draft ICU 4.0
*/
virtual int32_t getTimeLimit() const;
/**
* Set the amount of heap storage available for use by the match backtracking stack.
* The matcher is also reset, discarding any results from previous matches.
* <p>
* ICU uses a backtracking regular expression engine, with the backtrack stack
* maintained on the heap. This function sets the limit to the amount of memory
* that can be used for this purpose. A backtracking stack overflow will
* result in an error from the match operation that caused it.
* <p>
* A limit is desirable because a malicious or poorly designed pattern can use
* excessive memory, potentially crashing the process. A limit is enabled
* by default.
* <p>
* @param limit The maximum size, in bytes, of the matching backtrack stack.
* A value of zero means no limit.
* The limit must be greater or equal to zero.
*
* @param status A reference to a UErrorCode to receive any errors.
*
* @draft ICU 4.0
*/
virtual void setStackLimit(int32_t limit, UErrorCode &status);
/**
* Get the size of the heap storage available for use by the back tracking stack.
*
* @return the maximum backtracking stack size, in bytes, or zero if the
* stack size is unlimited.
* @draft ICU 4.0
*/
virtual int32_t getStackLimit() const;
/**
* Set a callback function for use with this Matcher.
* During matching operations the function will be called periodically,
* giving the application the opportunity to terminate a long-running
* match.
*
* @param callback A pointer to the user-supplied callback function.
* @param context User context pointer. The value supplied at the
* time the callback function is set will be saved
* and passed to the callback each time that it is called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
virtual void setMatchCallback(URegexMatchCallback callback,
const void *context,
UErrorCode &status);
/**
* Get the callback function for this Matcher.
*
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when setMatchCallback() was called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
virtual void getMatchCallback(URegexMatchCallback &callback,
const void *&context,
UErrorCode &status);
/**
@ -1030,10 +1126,13 @@ public:
private:
// Constructors and other object boilerplate are private.
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
RegexMatcher(); // default constructor not implemented
RegexMatcher(); // default constructor not implemented
RegexMatcher(const RegexPattern *pat);
RegexMatcher(const RegexMatcher &other);
RegexMatcher &operator =(const RegexMatcher &rhs);
void init(UErrorCode &status); // Common initialization
void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2.
friend class RegexPattern;
friend class RegexCImpl;
public:
@ -1050,8 +1149,8 @@ private:
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
REStackFrame *resetStack();
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
int32_t frameSize, UErrorCode &status);
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
void IncrementTime(UErrorCode &status);
const RegexPattern *fPattern;
@ -1059,6 +1158,7 @@ private:
// should delete it when through.
const UnicodeString *fInput; // The text being matched. Is never NULL.
int32_t fFrameSize; // The size of a frame in the backtrack stack.
int32_t fRegionStart; // Start of the input region, default = 0.
int32_t fRegionLimit; // End of input region, default to input.length.
@ -1101,9 +1201,25 @@ private:
int32_t *fData; // Data area for use by the compiled pattern.
int32_t fSmallData[8]; // Use this for data if it's enough.
int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
// match engine run. Zero for unlimited.
int32_t fTime; // Match time, accumulates while matching.
int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
// Kept separately from fTime to keep as much
// code as possible out of the inline
// StateSave function.
int32_t fStackLimit; // Maximum memory size to use for the backtrack
// stack, in bytes. Zero for unlimited.
URegexMatchCallback fCallbackFn; // Pointer to match progress callback funct.
// NULL if there is no callback.
const void *fCallbackContext; // User Context ptr for callback function.
UBool fTraceDebug; // Set true for debug tracing of match engine.
UErrorCode fDeferredStatus; // Save error state if that cannot be immediately
UErrorCode fDeferredStatus; // Save error state that cannot be immediately
// reported, or that permanently disables this matcher.
RuleBasedBreakIterator *fWordBreakItr;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2004-2007, International Business Machines
* Copyright (C) 2004-2008, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: regex.h
@ -843,5 +843,145 @@ uregex_split( URegularExpression *regexp,
/**
* Set a processing time limit for match operations with this URegularExpression.
*
* Some patterns, when matching certain strings, can run in exponential time.
* For practical purposes, the match operation may appear to be in an
* infinite loop.
* When a limit is set a match operation will fail with an error if the
* limit is exceeded.
* <p>
* The units of the limit are steps of the match engine.
* Correspondence with actual processor time will depend on the speed
* of the processor and the details of the specific pattern, but will
* typically be on the order of milliseconds.
* <p>
* By default, the matching time is not limited.
* <p>
*
* @param regexp The compiled regular expression.
* @param limit The limit value, or 0 for no limit.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_setTimeLimit(URegularExpression *regexp,
int32_t limit,
UErrorCode *status);
/**
* Get the time limit for matches with this URegularExpression.
* A return value of zero indicates that there is no limit.
*
* @param regexp The compiled regular expression.
* @param status A reference to a UErrorCode to receive any errors.
* @return the maximum allowed time for a match, in units of processing steps.
* @draft ICU 4.0
*/
U_DRAFT int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression *regexp,
UErrorCode *status);
/**
* Set the amount of heap storage available for use by the match backtracking stack.
* <p>
* ICU uses a backtracking regular expression engine, with the backtrack stack
* maintained on the heap. This function sets the limit to the amount of memory
* that can be used for this purpose. A backtracking stack overflow will
* result in an error from the match operation that caused it.
* <p>
* A limit is desirable because a malicious or poorly designed pattern can use
* excessive memory, potentially crashing the process. A limit is enabled
* by default.
* <p>
* @param regexp The compiled regular expression.
* @param limit The maximum size, in bytes, of the matching backtrack stack.
* A value of zero means no limit.
* The limit must be greater than or equal to zero.
* @param status A reference to a UErrorCode to receive any errors.
*
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_setStackLimit(URegularExpression *regexp,
int32_t limit,
UErrorCode *status);
/**
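* Get the size of the heap storage available for use by the back tracking stack.
*
* @param regexp The compiled regular expression.
* @param status A reference to a UErrorCode to receive any errors.
* @return the maximum backtracking stack size, in bytes, or zero if the
* stack size is unlimited.
* @draft ICU 4.0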
*/
U_DRAFT int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression *regexp,
UErrorCode *status);
/**
* Function pointer for a regular expression matching callback function.
* When set, a callback function will be called periodically during matching
* operations. If the call back function returns FALSE, the matching
* operation will be terminated early.
*
* Note: the callback function must not call other functions on this
* URegularExpression.
*
* @param context context pointer. The callback function will be invoked
* with the context specified at the time that
* uregex_setMatchCallback() is called.
* @param steps the accumulated processing time, in match steps,
* for this matching operation.
* @return TRUE to continue the matching operation.
* FALSE to terminate the matching operation.
* @draft ICU 4.0
*/
typedef UBool (U_EXPORT2 *URegexMatchCallback) (
const void *context,
int32_t steps);
/**
* Set a callback function for this URegularExpression.
* During matching operations the function will be called periodically,
* giving the application the opportunity to terminate a long-running
* match.
*
* @param regexp The compiled regular expression.
* @param callback A pointer to the user-supplied callback function.
* @param context User context pointer. The value supplied at the
* time the callback function is set will be saved
* and passed to the callback each time that it is called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_setMatchCallback(URegularExpression *regexp,
URegexMatchCallback callback,
const void *context,
UErrorCode *status);
/**
* Get the callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setMatchCallback() was called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_getMatchCallback(const URegularExpression *regexp,
URegexMatchCallback *callback,
const void **context,
UErrorCode *status);
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
#endif /* UREGEX_H */

View file

@ -74,7 +74,6 @@ static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool
return FALSE;
}
if (re == NULL || re->fMagic != REXP_MAGIC) {
// U_ASSERT(FALSE);
*status = U_ILLEGAL_ARGUMENT_ERROR;
return FALSE;
}
@ -630,6 +629,103 @@ uregex_requireEnd(const URegularExpression *regexp,
}
//------------------------------------------------------------------------------
//
//    uregex_setTimeLimit
//
//    C API wrapper: forwards the requested limit to setTimeLimit() on the
//    matcher owned by regexp. Nothing is forwarded when validateRE() returns
//    FALSE for the handle / incoming status.
//
//    NOTE(review): validateRE() takes a third UBool argument that is left at
//    its default here -- confirm the default is the intended validation for
//    a call that only changes a limit.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression *regexp,
                    int32_t             limit,
                    UErrorCode         *status) {
    if (!validateRE(regexp, status)) {
        return;
    }
    regexp->fMatcher->setTimeLimit(limit, *status);
}
//------------------------------------------------------------------------------
//
//    uregex_getTimeLimit
//
//    C API wrapper: returns getTimeLimit() from the matcher owned by regexp,
//    or 0 when validateRE() returns FALSE.
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression *regexp,
                    UErrorCode               *status) {
    if (!validateRE(regexp, status)) {
        return 0;
    }
    return regexp->fMatcher->getTimeLimit();
}
//------------------------------------------------------------------------------
//
//    uregex_setStackLimit
//
//    C API wrapper: forwards the requested limit to setStackLimit() on the
//    matcher owned by regexp. Nothing is forwarded when validateRE() returns
//    FALSE.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression *regexp,
                     int32_t             limit,
                     UErrorCode         *status) {
    if (!validateRE(regexp, status)) {
        return;
    }
    regexp->fMatcher->setStackLimit(limit, *status);
}
//------------------------------------------------------------------------------
//
//    uregex_getStackLimit
//
//    C API wrapper: returns getStackLimit() from the matcher owned by regexp,
//    or 0 when validateRE() returns FALSE.
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression *regexp,
                     UErrorCode               *status) {
    if (!validateRE(regexp, status)) {
        return 0;
    }
    return regexp->fMatcher->getStackLimit();
}
//------------------------------------------------------------------------------
//
//    uregex_setMatchCallback
//
//    C API wrapper: hands the callback function and its context pointer to
//    setMatchCallback() on the matcher owned by regexp. Nothing is forwarded
//    when validateRE() returns FALSE.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression  *regexp,
                        URegexMatchCallback  callback,
                        const void          *context,
                        UErrorCode          *status) {
    if (!validateRE(regexp, status)) {
        return;
    }
    regexp->fMatcher->setMatchCallback(callback, context, *status);
}
//------------------------------------------------------------------------------
//
//    uregex_getMatchCallback
//
//    C API wrapper: retrieves the callback function and context pointer from
//    the matcher owned by regexp via getMatchCallback().
//
//    callback and context are out parameters and are dereferenced to form the
//    references passed to the matcher, so both must be non-NULL. A NULL out
//    pointer is reported as U_ILLEGAL_ARGUMENT_ERROR instead of being
//    dereferenced. Nothing is written when validateRE() returns FALSE.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression *regexp,
                        URegexMatchCallback      *callback,
                        const void              **context,
                        UErrorCode               *status) {
    if (!validateRE(regexp, status)) {
        return;
    }
    if (callback == NULL || context == NULL) {
        // No place to store the results; fail cleanly rather than crash.
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    regexp->fMatcher->getMatchCallback(*callback, *context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_replaceAll

View file

@ -100,7 +100,31 @@ void addURegexTest(TestNode** root)
addTest(root, &TestBug4315, "regex/TestBug4315");
}
/*
* Call back function and context struct used for testing
* regular expression user callbacks. This test is mostly the same as
* the corresponding C++ test in intltest.
*/
/* Bookkeeping shared between a test and TestCallbackFn through the callback's context pointer. */
typedef struct callBackContext {
/* TestCallbackFn returns FALSE once numCalls has reached this value. */
int32_t maxCalls;
/* Incremented by TestCallbackFn each time it is invoked. */
int32_t numCalls;
/* The steps value received by the most recent TestCallbackFn invocation. */
int32_t lastSteps;
} callBackContext;
/*
 * Match callback used by the tests. context must point to a callBackContext.
 * Logs an error unless steps is exactly one more than the value seen on the
 * previous call, records the call, and returns TRUE while fewer than
 * maxCalls calls have been made (FALSE from then on).
 */
static UBool U_EXPORT2 U_CALLCONV
TestCallbackFn(const void *context, int32_t steps) {
    callBackContext *cb = (callBackContext *)context;
    int32_t expectedSteps = cb->lastSteps + 1;

    if (steps != expectedSteps) {
        log_err("incorrect steps in callback. Expected %d, got %d\n", expectedSteps, steps);
    }
    cb->lastSteps = steps;
    cb->numCalls += 1;
    if (cb->numCalls >= cb->maxCalls) {
        return FALSE;
    }
    return TRUE;
}
/*
* Regular Expression C API Tests
*/
static void TestRegexCAPI(void) {
UErrorCode status = U_ZERO_ERROR;
URegularExpression *re;
@ -1144,8 +1168,72 @@ static void TestRegexCAPI(void) {
uregex_close(re);
}
/*
* set/getTimeLimit
*/
TEST_SETUP("abc$", "abcdef", 0);
TEST_ASSERT(uregex_getTimeLimit(re, &status) == 0);
uregex_setTimeLimit(re, 1000, &status);
TEST_ASSERT(uregex_getTimeLimit(re, &status) == 1000);
TEST_ASSERT_SUCCESS(status);
uregex_setTimeLimit(re, -1, &status);
TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
status = U_ZERO_ERROR;
TEST_ASSERT(uregex_getTimeLimit(re, &status) == 1000);
TEST_TEARDOWN;
/*
* set/get Stack Limit
*/
TEST_SETUP("abc$", "abcdef", 0);
TEST_ASSERT(uregex_getStackLimit(re, &status) == 8000000);
uregex_setStackLimit(re, 40000, &status);
TEST_ASSERT(uregex_getStackLimit(re, &status) == 40000);
TEST_ASSERT_SUCCESS(status);
uregex_setStackLimit(re, -1, &status);
TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
status = U_ZERO_ERROR;
TEST_ASSERT(uregex_getStackLimit(re, &status) == 40000);
TEST_TEARDOWN;
/*
* Get/Set callback functions
* This test is copied from intltest regex/Callbacks
* The pattern and test data will run long enough to cause the callback
* to be invoked. The nested '+' operators give exponential time
* behavior with increasing string length.
*/
TEST_SETUP("((.)+\\2)+x", "aaaaaaaaaaaaaaaaaaab", 0)
callBackContext cbInfo = {4, 0, 0};
const void *pContext = &cbInfo;
URegexMatchCallback returnedFn = &TestCallbackFn;
/* Getting the callback fn when it hasn't been set must return NULL */
uregex_getMatchCallback(re, &returnedFn, &pContext, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(returnedFn == NULL);
TEST_ASSERT(pContext == NULL);
/* Set the callback and do a match. */
/* The callback function should record that it has been called. */
uregex_setMatchCallback(re, &TestCallbackFn, &cbInfo, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(cbInfo.numCalls == 0);
TEST_ASSERT(uregex_matches(re, -1, &status) == FALSE);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(cbInfo.numCalls > 0);
/* Getting the callback should return the values that were set above. */
uregex_getMatchCallback(re, &returnedFn, &pContext, &status);
TEST_ASSERT(returnedFn == &TestCallbackFn);
TEST_ASSERT(pContext == &cbInfo);
TEST_TEARDOWN;
}
static void TestBug4315(void) {
UErrorCode theICUError = U_ZERO_ERROR;
URegularExpression *theRegEx;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2007, International Business Machines Corporation and
* Copyright (c) 2002-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -66,7 +66,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 6: name = "PerlTests";
if (exec) PerlTests();
break;
case 7: name = "Callbacks";
if (exec) Callbacks();
break;
default: name = "";
break; //needed to end loop
@ -837,6 +839,90 @@ void RegexTest::API_Match() {
}
#endif
//
// Time Outs.
// Note: These tests will need to be changed when the regexp engine is
// able to detect and cut short the exponential time behavior on
// this type of match.
//
{
UErrorCode status = U_ZERO_ERROR;
// Enough 'a's in the string to cause the match to time out.
// (Each additional 'a' doubles the time)
UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
RegexMatcher matcher("(a+)+b", testString, 0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getTimeLimit() == 0);
matcher.setTimeLimit(100, status);
REGEX_ASSERT(matcher.getTimeLimit() == 100);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_TIME_OUT);
}
{
UErrorCode status = U_ZERO_ERROR;
// Few enough 'a's to slip in under the time limit.
UnicodeString testString("aaaaaaaaaaaaaaaaaa");
RegexMatcher matcher("(a+)+b", testString, 0, status);
REGEX_CHECK_STATUS;
matcher.setTimeLimit(100, status);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_CHECK_STATUS;
}
//
// Stack Limits
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
// Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
// of the '+', and makes the stack frames larger.
RegexMatcher matcher("(A)+A$", testString, 0, status);
// With the default stack, this match should fail to run
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
// With unlimited stack, it should run
status = U_ZERO_ERROR;
matcher.setStackLimit(0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getStackLimit() == 0);
// With a limited stack, the match should fail
status = U_ZERO_ERROR;
matcher.setStackLimit(10000, status);
REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
REGEX_ASSERT(matcher.getStackLimit() == 10000);
}
// A pattern that doesn't save state should work with
// a minimal sized stack
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString = "abc";
RegexMatcher matcher("abc", testString, 0, status);
REGEX_CHECK_STATUS;
matcher.setStackLimit(30, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.matches(status) == TRUE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(matcher.getStackLimit() == 30);
// Negative stack sizes should fail
status = U_ZERO_ERROR;
matcher.setStackLimit(1000, status);
REGEX_CHECK_STATUS;
matcher.setStackLimit(-1, status);
REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
REGEX_ASSERT(matcher.getStackLimit() == 1000);
}
}
@ -2299,6 +2385,98 @@ void RegexTest::PerlTests() {
}
//
// Callbacks() Test the callback function.
// When set, callbacks occur periodically during matching operations,
// giving the application code the ability to abort the operation
// before its normal completion.
//
// Bookkeeping shared between a test and testCallBackFn through the
// callback's context pointer.
struct callBackContext {
    RegexTest *test;        // Test object used by the callback to report errors.
    int32_t    maxCalls;    // Callback returns FALSE once numCalls reaches this.
    int32_t    numCalls;    // Number of callback invocations so far.
    int32_t    lastSteps;   // steps value seen by the most recent invocation.

    // Start a fresh count, allowing at most 'max' callback invocations
    // before the callback asks for the match to stop.
    void reset(int32_t max) {
        maxCalls  = max;
        numCalls  = 0;
        lastSteps = 0;
    }
};
U_CDECL_BEGIN
// Match callback used by RegexTest::Callbacks(). context must point to a
// callBackContext. Reports an error through the context's test object unless
// steps is exactly one more than on the previous call, records the call, and
// returns TRUE while fewer than maxCalls calls have been made.
static UBool U_CALLCONV
testCallBackFn(const void *context, int32_t steps) {
    callBackContext *cb = (callBackContext *)context;
    const int32_t expectedSteps = cb->lastSteps + 1;

    if (steps != expectedSteps) {
        cb->test->errln("incorrect steps in callback. Expected %d, got %d\n", expectedSteps, steps);
    }
    cb->lastSteps = steps;
    cb->numCalls += 1;
    if (cb->numCalls >= cb->maxCalls) {
        return FALSE;
    }
    return TRUE;
}
U_CDECL_END
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback() and checks how
// testCallBackFn is invoked for short, medium and long running matches.
void RegexTest::Callbacks() {
{
// Getter returns NULLs if no callback has been set
// The variables that the getter will fill in.
// Init to non-null values so that the action of the getter can be seen.
const void *returnedContext = &returnedContext;
URegexMatchCallback returnedFn = &testCallBackFn;
UErrorCode status = U_ZERO_ERROR;
RegexMatcher matcher("x", 0, status);
REGEX_CHECK_STATUS;
matcher.getMatchCallback(returnedFn, returnedContext, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(returnedFn == NULL);
REGEX_ASSERT(returnedContext == NULL);
}
{
// Set and Get work
// cbInfo is handed to the matcher as the callback context; its counters are
// re-armed with reset() before each of the matches below.
callBackContext cbInfo = {this, 0, 0, 0};
// Deliberately left uninitialized: both are filled in by getMatchCallback().
const void *returnedContext;
URegexMatchCallback returnedFn;
UErrorCode status = U_ZERO_ERROR;
RegexMatcher matcher("((.)+\\2)+x", 0, status); // A pattern that can run long.
REGEX_CHECK_STATUS;
matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
REGEX_CHECK_STATUS;
matcher.getMatchCallback(returnedFn, returnedContext, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(returnedFn == &testCallBackFn);
REGEX_ASSERT(returnedContext == &cbInfo);
// A short-running match shouldn't invoke the callback
// (reset(1) makes the callback return FALSE on its very first call, so the
// checks below only hold if it is never called.)
status = U_ZERO_ERROR;
cbInfo.reset(1);
UnicodeString s = "xxx";
matcher.reset(s);
REGEX_ASSERT(matcher.matches(status));
REGEX_CHECK_STATUS;
REGEX_ASSERT(cbInfo.numCalls == 0);
// A medium-length match that runs long enough to invoke the
// callback, but not so long that the callback aborts it.
// (reset(4): the callback returns FALSE on its fourth call.)
status = U_ZERO_ERROR;
cbInfo.reset(4);
s = "aaaaaaaaaaaaaaaaaaab";
matcher.reset(s);
REGEX_ASSERT(matcher.matches(status)==FALSE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(cbInfo.numCalls > 0);
// A longer running match that the callback function will abort.
// The abort is expected on exactly the fourth callback invocation and is
// reported through status as U_REGEX_STOPPED_BY_CALLER.
status = U_ZERO_ERROR;
cbInfo.reset(4);
s = "aaaaaaaaaaaaaaaaaaaaaaab";
matcher.reset(s);
REGEX_ASSERT(matcher.matches(status)==FALSE);
REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
REGEX_ASSERT(cbInfo.numCalls == 4);
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2007, International Business Machines Corporation and
* Copyright (c) 2002-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -30,6 +30,7 @@ public:
virtual void Extended();
virtual void Errors();
virtual void PerlTests();
virtual void Callbacks();
// The following functions are internal to the regexp tests.
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line);