diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 6fe325ba5d5..12977ed5e42 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -729,6 +729,9 @@ typedef enum UErrorCode { U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. */ U_REGEX_MISSING_CLOSE_BRACKET, /**< Missing closing bracket on a bracket expression. */ U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */ + U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */ + U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */ + U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */ U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ /* diff --git a/icu4c/source/common/utypes.c b/icu4c/source/common/utypes.c index 1674a737d8c..0e24e6a64ac 100644 --- a/icu4c/source/common/utypes.c +++ b/icu4c/source/common/utypes.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1997-2007, International Business Machines +* Copyright (C) 1997-2008, International Business Machines * Corporation and others. All Rights Reserved. 
* ****************************************************************************** @@ -156,7 +156,10 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = { "U_REGEX_SET_CONTAINS_STRING", "U_REGEX_OCTAL_TOO_BIG", "U_REGEX_MISSING_CLOSE_BRACKET", - "U_REGEX_INVALID_RANGE" + "U_REGEX_INVALID_RANGE", + "U_REGEX_STACK_OVERFLOW", + "U_REGEX_TIME_OUT", + "U_REGEX_STOPPED_BY_CALLER" }; static const char * const diff --git a/icu4c/source/common/uvectr32.cpp b/icu4c/source/common/uvectr32.cpp index b9c0f751bb4..2ccb9063525 100644 --- a/icu4c/source/common/uvectr32.cpp +++ b/icu4c/source/common/uvectr32.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 1999-2003, International Business Machines Corporation and * +* Copyright (C) 1999-2008, International Business Machines Corporation and * * others. All Rights Reserved. * ****************************************************************************** * Date Name Description @@ -26,6 +26,7 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UVector32) UVector32::UVector32(UErrorCode &status) : count(0), capacity(0), + maxCapacity(0), elements(NULL) { _init(DEFUALT_CAPACITY, status); @@ -34,6 +35,7 @@ UVector32::UVector32(UErrorCode &status) : UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) : count(0), capacity(0), + maxCapacity(0), elements(0) { _init(initialCapacity, status); @@ -46,6 +48,9 @@ void UVector32::_init(int32_t initialCapacity, UErrorCode &status) { if (initialCapacity < 1) { initialCapacity = DEFUALT_CAPACITY; } + if (maxCapacity>0 && maxCapacity= minimumCapacity) { + return TRUE; } + if (maxCapacity>0 && minimumCapacity>maxCapacity) { + status = U_BUFFER_OVERFLOW_ERROR; + return FALSE; + } + int32_t newCap = capacity * 2; + if (newCap < minimumCapacity) { + newCap = minimumCapacity; + } + if (maxCapacity > 0 && newCap > maxCapacity) { + newCap = maxCapacity; + } + int32_t* newElems = (int32_t *)uprv_realloc(elements, 
sizeof(int32_t)*newCap); + if (newElems == NULL) { + // We keep the original contents on the memory failure on realloc. + status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + elements = newElems; + capacity = newCap; return TRUE; } +void UVector32::setMaxCapacity(int32_t limit) { + U_ASSERT(limit >= 0); + maxCapacity = limit; + if (maxCapacity < 0) { + maxCapacity = 0; + } + if (capacity <= maxCapacity || maxCapacity == 0) { + // Current capacity is within the new limit. + return; + } + + // New maximum capacity is smaller than the current size. + // Realloc the storage to the new, smaller size. + int32_t* newElems = (int32_t *)uprv_realloc(elements, sizeof(int32_t)*maxCapacity); + if (newElems == NULL) { + // Realloc to smaller failed. + // Just keep what we had. No need to call it a failure. + return; + } + elements = newElems; + capacity = maxCapacity; + if (count > capacity) { + count = capacity; + } +} + /** * Change the size of this vector as follows: If newSize is smaller, * then truncate the array, possibly deleting held elements for i >= diff --git a/icu4c/source/common/uvectr32.h b/icu4c/source/common/uvectr32.h index 68a3ee2e2fe..bec5dd42e70 100644 --- a/icu4c/source/common/uvectr32.h +++ b/icu4c/source/common/uvectr32.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2006, International Business Machines +* Copyright (C) 1999-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -61,6 +61,8 @@ private: int32_t count; int32_t capacity; + + int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow. int32_t* elements; @@ -161,6 +163,14 @@ public: */ int32_t *getBuffer() const; + /** + * Set the maximum allowed buffer capacity for this vector/stack. + * Default with no limit set is unlimited, go until malloc() fails. + * A Limit of zero means unlimited capacity. 
+ * Units are vector elements (32 bits each), not bytes. + */ + void setMaxCapacity(int32_t limit); + /** * ICU "poor man's RTTI", returns a UClassID for this class. */ @@ -221,7 +231,9 @@ inline void UVector32::addElement(int32_t elem, UErrorCode &status) { } inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) { - ensureCapacity(count+size, status); + if (ensureCapacity(count+size, status) == FALSE) { + return NULL; + } int32_t *rp = elements+count; count += size; return rp; diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index f81afa2d218..d0d92d57d93 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2007 International Business Machines Corporation * +* Copyright (C) 2002-2008 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ @@ -30,90 +30,69 @@ U_NAMESPACE_BEGIN +// Default limit for the size of the back track stack, to avoid system +// failures caused by heap exhaustion. Units are in 32 bit words, not bytes. +// This value puts ICU's limits higher than most other regexp implementations, +// which use recursion rather than the heap, and take more storage per +// backtrack point. +// +static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; + +// Time limit counter constant. +// Time limits for expression evaluation are in terms of quanta of work by +// the engine, each of which is 10,000 state saves. +// This constant determines the number of state saves per tick.
+static const int32_t TIMER_INITIAL_VALUE = 10000; + //----------------------------------------------------------------------------- // // Constructor and Destructor // //----------------------------------------------------------------------------- RegexMatcher::RegexMatcher(const RegexPattern *pat) { - fPattern = pat; - fPatternOwned = NULL; - fInput = NULL; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(fDeferredStatus); - fData = fSmallData; - fWordBreakItr = NULL; - fTransparentBounds = FALSE; - fAnchoringBounds = TRUE; + fDeferredStatus = U_ZERO_ERROR; + init(fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return; + } if (pat==NULL) { fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; return; } - if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); - } - if (fStack == NULL || fData == NULL) { - fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; - } - - reset(RegexStaticSets::gStaticSets->fEmptyString); + fPattern = pat; + init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus); } RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, uint32_t flags, UErrorCode &status) { - UParseError pe; - fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); - fPattern = fPatternOwned; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(status); - fData = fSmallData; - fWordBreakItr = NULL; - fTransparentBounds = FALSE; - fAnchoringBounds = TRUE; + init(status); if (U_FAILURE(status)) { return; } - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); - } - if (fStack == NULL || fData == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - reset(input); + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + fPattern = fPatternOwned; + init2(input, status); } 
RegexMatcher::RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status) { - UParseError pe; - fTraceDebug = FALSE; - fDeferredStatus = U_ZERO_ERROR; - fStack = new UVector32(status); - fData = fSmallData; - fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); - fPattern = fPatternOwned; - fWordBreakItr = NULL; - fTransparentBounds = FALSE; - fAnchoringBounds = TRUE; + init(status); if (U_FAILURE(status)) { return; } - - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { - fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); - } - if (fStack == NULL || fData == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - reset(RegexStaticSets::gStaticSets->fEmptyString); + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + fPattern = fPatternOwned; + init2(RegexStaticSets::gStaticSets->fEmptyString, status); } + RegexMatcher::~RegexMatcher() { delete fStack; if (fData != fSmallData) { @@ -130,6 +109,79 @@ RegexMatcher::~RegexMatcher() { #endif } +// +// init() common initialization for use by all constructors. +// Initialize all fields, get the object into a consistent state. +// This must be done even when the initial status shows an error, +// so that the object is initialized sufficiently well for the destructor +// to run safely. 
+// +void RegexMatcher::init(UErrorCode &status) { + fPattern = NULL; + fPatternOwned = NULL; + fInput = NULL; + fFrameSize = 0; + fRegionStart = 0; + fRegionLimit = 0; + fAnchorStart = 0; + fAnchorLimit = 0; + fLookStart = 0; + fLookLimit = 0; + fActiveStart = 0; + fActiveLimit = 0; + fTransparentBounds = FALSE; + fAnchoringBounds = TRUE; + fMatch = FALSE; + fMatchStart = 0; + fMatchEnd = 0; + fLastMatchEnd = -1; + fAppendPosition = 0; + fHitEnd = FALSE; + fRequireEnd = FALSE; + fStack = NULL; + fFrame = NULL; + fTimeLimit = 0; + fTime = 0; + fTickCounter = 0; + fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; + fCallbackFn = NULL; + fCallbackContext = NULL; + fTraceDebug = FALSE; + fDeferredStatus = status; + fData = fSmallData; + fWordBreakItr = NULL; + + fStack = new UVector32(status); + if (U_FAILURE(status)) { + fDeferredStatus = status; + } +} + +// +// init2() Common initialization for use by RegexMatcher constructors, part 2. +// This handles the common setup to be done after the Pattern is available. 
+// +void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) { + if (U_FAILURE(status)) { + fDeferredStatus = status; + return; + } + + if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { + fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); + if (fData == NULL) { + status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + + reset(input); + setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); + if (U_FAILURE(status)) { + fDeferredStatus = status; + return; + } +} static const UChar BACKSLASH = 0x5c; @@ -856,6 +908,8 @@ void RegexMatcher::resetPreserveRegion() { fMatch = FALSE; fHitEnd = FALSE; fRequireEnd = FALSE; + fTime = 0; + fTickCounter = TIMER_INITIAL_VALUE; resetStack(); } @@ -1067,6 +1121,118 @@ RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { return *this; } +//-------------------------------------------------------------------------------- +// +// setTimeLimit +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return; + } + if (limit < 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fTimeLimit = limit; +} + + +//-------------------------------------------------------------------------------- +// +// getTimeLimit +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::getTimeLimit() const { + return fTimeLimit; +} + + +//-------------------------------------------------------------------------------- +// +// setStackLimit +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return; + } 
+ if (limit < 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + // Reset the matcher. This is needed here in case there is a current match + // whose final stack frame (containing the match results, pointed to by fFrame) + // would be lost by resizing to a smaller stack size. + reset(); + + if (limit == 0) { + // Unlimited stack expansion + fStack->setMaxCapacity(0); + } else { + // Change the units of the limit from bytes to ints, and bump the size up + // to be big enough to hold at least one stack frame for the pattern, + // if it isn't there already. + int32_t adjustedLimit = limit / sizeof(int32_t); + if (adjustedLimit < fPattern->fFrameSize) { + adjustedLimit = fPattern->fFrameSize; + } + fStack->setMaxCapacity(adjustedLimit); + } + fStackLimit = limit; +} + + +//-------------------------------------------------------------------------------- +// +// getStackLimit +// +//-------------------------------------------------------------------------------- +int32_t RegexMatcher::getStackLimit() const { + return fStackLimit; +} + + +//-------------------------------------------------------------------------------- +// +// setMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::setMatchCallback(URegexMatchCallback callback, + const void *context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + fCallbackFn = callback; + fCallbackContext = context; +} + + +//-------------------------------------------------------------------------------- +// +// getMatchCallback +// +//-------------------------------------------------------------------------------- +void RegexMatcher::getMatchCallback(URegexMatchCallback &callback, + const void *&context, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + callback = fCallbackFn; + context = fCallbackContext; +} + //================================================================================ // @@ -1187,6 +1353,31 @@ 
UBool RegexMatcher::isUWordBoundary(int32_t pos) { return returnVal; } +//-------------------------------------------------------------------------------- +// +// IncrementTime This function is called once each TIMER_INITIAL_VALUE state +// saves. Increment the "time" counter, and call the +// user callback function if there is one installed. +// +// If the match operation needs to be aborted, either for a time-out +// or because the user callback asked for it, just set an error status. +// The engine will pick that up and stop in its outer loop. +// +//-------------------------------------------------------------------------------- +void RegexMatcher::IncrementTime(UErrorCode &status) { + fTickCounter = TIMER_INITIAL_VALUE; + fTime++; + if (fCallbackFn != NULL) { + if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { + status = U_REGEX_STOPPED_BY_CALLER; + return; + } + } + if (fTimeLimit > 0 && fTime >= fTimeLimit) { + status = U_REGEX_TIME_OUT; + } +} + //-------------------------------------------------------------------------------- // // StateSave @@ -1196,13 +1387,33 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) { // the newly created stack frame // // Note that reserveBlock() may grow the stack, resulting in the -// whole thing being relocated in memory. +// whole thing being relocated in memory. +// +// Parameters: +// fp The top frame pointer when called. At return, a new +// frame will be present +// savePatIdx An index into the compiled pattern. Goes into the original +// (not new) frame. If execution ever back-tracks out of the +// new frame, this will be where we continue from in the pattern. +// Return +// The new frame pointer.
// //-------------------------------------------------------------------------------- -inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { +inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) { // push storage for a new frame. - int32_t *newFP = fStack->reserveBlock(frameSize, status); - fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. + int32_t *newFP = fStack->reserveBlock(fFrameSize, status); + if (newFP == NULL) { + // Failure on attempted stack expansion. + // Stack function set some other error code, change it to a more + // specific one for regular expressions. + status = U_REGEX_STACK_OVERFLOW; + // We need to return a writable stack frame, so just return the + // previous frame. The match operation will stop quickly + // because of the error status, after which the frame will never + // be looked at again. + return fp; + } + fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. // New stack frame = copy of old top frame. int32_t *source = (int32_t *)fp; @@ -1214,6 +1425,10 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatId } } + fTickCounter--; + if (fTickCounter <= 0) { + IncrementTime(status); // Re-initializes fTickCounter + } fp->fPatIdx = savePatIdx; return (REStackFrame *)newFP; } @@ -1262,7 +1477,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // Cache frequently referenced items from the compiled pattern - // in local variables.
// int32_t *pat = fPattern->fCompiledPat->getBuffer(); @@ -1271,8 +1485,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { const UChar *inputBuf = fInput->getBuffer(); + fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); - int32_t frameSize = fPattern->fFrameSize; fp->fPatIdx = 0; fp->fInputIdx = startIdx; @@ -1316,7 +1530,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1330,7 +1544,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } else { fHitEnd = TRUE; } - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1352,7 +1566,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { if (fp->fInputIdx + stringLen > fActiveLimit) { // No match. String is longer than the remaining input text. fHitEnd = TRUE; // TODO: See ticket 6074 - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -1370,7 +1584,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } } else { // Match failed. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } @@ -1380,7 +1594,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { case URX_STATE_SAVE: - fp = StateSave(fp, opValue, frameSize, status); + fp = StateSave(fp, opValue, status); break; @@ -1389,7 +1603,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // when we reach the end of the pattern. 
if (toEnd && fp->fInputIdx != fActiveLimit) { // The pattern matched, but not to the end of input. Try some more. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } isMatch = TRUE; @@ -1401,26 +1615,26 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < frameSize-3); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue+2] = fp->fInputIdx; break; case URX_END_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < frameSize-3); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; - + case URX_DOLLAR: // $, test for End of line // or for position before new line at end of input if (fp->fInputIdx < fAnchorLimit-2) { // We are no where near the end of input. Fail. // This is the common case. Keep it first. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } if (fp->fInputIdx >= fAnchorLimit) { @@ -1451,7 +1665,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { break; // At CR/LF at end of input. Success } - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1475,7 +1689,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // Not at end of input. Back-track out. 
- fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1499,7 +1713,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } } // not at a new line. Fail. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1515,7 +1729,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // If we are not positioned just before a new-line, the test fails; backtrack out. // It makes no difference where the new-line is within the input. if (inputBuf[fp->fInputIdx] != 0x0a) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1523,7 +1737,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { case URX_CARET: // ^, test for start of line if (fp->fInputIdx != fAnchorStart) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1544,7 +1758,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { break; } // Not at the start of a line. Fail. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1561,7 +1775,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UChar c = inputBuf[fp->fInputIdx - 1]; if (c != 0x0a) { // Not at the start of a line. Back-track out. 
- fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1571,7 +1785,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UBool success = isWordBoundary(fp->fInputIdx); success ^= (opValue != 0); // flip sense for \B if (!success) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1582,7 +1796,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UBool success = isUWordBoundary(fp->fInputIdx); success ^= (opValue != 0); // flip sense for \B if (!success) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1592,7 +1806,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -1603,7 +1817,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { if (success) { fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1); } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1611,7 +1825,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { case URX_BACKSLASH_G: // Test for position at end of previous match if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1625,7 +1839,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { // Fail if at end of input if (fp->fInputIdx >= fActiveLimit) { fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame 
*)fStack->popFrame(fFrameSize); break; } @@ -1703,7 +1917,7 @@ GC_Done: case URX_BACKSLASH_Z: // Test for end of Input if (fp->fInputIdx < fAnchorLimit) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } else { fHitEnd = TRUE; fRequireEnd = TRUE; @@ -1721,7 +1935,7 @@ GC_Done: // 1: success if input char is not in set. if (fp->fInputIdx >= fActiveLimit) { fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -1742,7 +1956,7 @@ GC_Done: } } if (!success) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1754,7 +1968,7 @@ GC_Done: // the predefined sets (Word Characters, for example) if (fp->fInputIdx >= fActiveLimit) { fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -1773,7 +1987,7 @@ GC_Done: } } - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1781,7 +1995,7 @@ GC_Done: case URX_SETREF: if (fp->fInputIdx >= fActiveLimit) { fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Pick up one char and test it for set membership. @@ -1801,7 +2015,7 @@ GC_Done: } } // the character wasn't in the set. Back track out. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1811,7 +2025,7 @@ GC_Done: if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. 
Advance over one char, unless we've hit end-of-line @@ -1820,7 +2034,7 @@ GC_Done: if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { // End of line in normal mode. . does not match. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } @@ -1833,7 +2047,7 @@ GC_Done: if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Advance over one char, except if we are @@ -1858,7 +2072,7 @@ GC_Done: if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = TRUE; - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Advance over one char, unless we've hit end-of-line @@ -1866,7 +2080,7 @@ GC_Done: U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c == 0x0a) { // End of line in normal mode. '.' does not match the \n - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1882,8 +2096,8 @@ GC_Done: case URX_JMP_SAV: U_ASSERT(opValue < fPattern->fCompiledPat->size()); - fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current - fp->fPatIdx = opValue; // Then JMP. + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. 
break; case URX_JMP_SAV_X: @@ -1896,12 +2110,12 @@ GC_Done: int32_t stoOp = pat[opValue-1]; U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); - U_ASSERT(frameLoc >= 0 && frameLoc < frameSize); + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); int32_t prevInputIdx = fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. - fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; fp->fExtra[frameLoc] = fp->fInputIdx; } @@ -1912,7 +2126,7 @@ GC_Done: case URX_CTR_INIT: { - U_ASSERT(opValue >= 0 && opValue < frameSize-2); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and @@ -1927,10 +2141,10 @@ GC_Done: U_ASSERT(loopLoc>fp->fPatIdx); if (minCount == 0) { - fp = StateSave(fp, loopLoc+1, frameSize, status); + fp = StateSave(fp, loopLoc+1, status); } if (maxCount == 0) { - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; @@ -1953,7 +2167,7 @@ GC_Done: break; } if (*pCounter >= minCount) { - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = opValue + 4; // Loop back. 
} @@ -1962,7 +2176,7 @@ GC_Done: case URX_CTR_INIT_NG: { // Initialize a non-greedy loop - U_ASSERT(opValue >= 0 && opValue < frameSize-2); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and @@ -1978,7 +2192,7 @@ GC_Done: if (minCount == 0) { if (maxCount != 0) { - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block } @@ -2017,7 +2231,7 @@ GC_Done: // Fall into the following pattern, but first do // a state save to the top of the loop, so that a failure // in the following pattern will try another iteration of the loop. - fp = StateSave(fp, opValue + 4, frameSize, status); + fp = StateSave(fp, opValue + 4, status); } } break; @@ -2032,12 +2246,12 @@ GC_Done: U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); int32_t newStackSize = fData[opValue]; U_ASSERT(newStackSize <= fStack->size()); - int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize; + int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; if (newFP == (int32_t *)fp) { break; } int32_t i; - for (i=0; ifExtra[opValue]; int32_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); int32_t len = groupEndIdx-groupStartIdx; if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. } if (len == 0) { @@ -2085,14 +2299,14 @@ GC_Done: if (haveMatch) { fp->fInputIdx += len; // Match. Advance current input position. } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 
} } break; case URX_STO_INP_LOC: { - U_ASSERT(opValue >= 0 && opValue < frameSize); + U_ASSERT(opValue >= 0 && opValue < fFrameSize); fp->fExtra[opValue] = fp->fInputIdx; } break; @@ -2102,13 +2316,13 @@ GC_Done: int32_t instrOperandLoc = fp->fPatIdx; fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); - U_ASSERT(dataLoc >= 0 && dataLoc < frameSize); + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); int32_t savedInputIdx = fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP } else { - fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. } } break; @@ -2137,9 +2351,9 @@ GC_Done: // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead // expression available. - int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize; + int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; int32_t i; - for (i=0; ipopFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_STRING_I: @@ -2195,7 +2409,7 @@ GC_Done: fHitEnd = TRUE; // See ticket 6074 } // No match. Back up matching to a saved state - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -2249,7 +2463,7 @@ GC_Done: // We have tried all potential match starting points without // getting a match. Backtrack out, and out of the // Look Behind altogether. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); int32_t restoreInputLen = fData[opValue+3]; U_ASSERT(restoreInputLen >= fActiveLimit); U_ASSERT(restoreInputLen <= fInput->length()); @@ -2259,7 +2473,7 @@ GC_Done: // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 
// (successful match will fall off the end of the loop.) - fp = StateSave(fp, fp->fPatIdx-3, frameSize, status); + fp = StateSave(fp, fp->fPatIdx-3, status); fp->fInputIdx = *lbStartIdx; } break; @@ -2274,7 +2488,7 @@ GC_Done: // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or fail // the look-behind altogether, whichever is appropriate. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -2333,7 +2547,7 @@ GC_Done: // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) - fp = StateSave(fp, fp->fPatIdx-4, frameSize, status); + fp = StateSave(fp, fp->fPatIdx-4, status); fp->fInputIdx = *lbStartIdx; } break; @@ -2348,7 +2562,7 @@ GC_Done: // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or succeed // the look-behind altogether, whichever is appropriate. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } @@ -2372,7 +2586,7 @@ GC_Done: // FAIL, which will take control back to someplace // prior to entering the look-behind test. - fp = (REStackFrame *)fStack->popFrame(frameSize); + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -2423,14 +2637,14 @@ GC_Done: int32_t loopcOp = pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); - U_ASSERT(stackLoc >= 0 && stackLoc < frameSize); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 
- fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; @@ -2486,14 +2700,14 @@ GC_Done: int32_t loopcOp = pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); - U_ASSERT(stackLoc >= 0 && stackLoc < frameSize); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. - fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; @@ -2501,7 +2715,7 @@ GC_Done: case URX_LOOP_C: { - U_ASSERT(opValue>=0 && opValue=0 && opValuefExtra[opValue]; U_ASSERT(terminalIdx <= fp->fInputIdx); if (terminalIdx == fp->fInputIdx) { @@ -2528,7 +2742,7 @@ GC_Done: } - fp = StateSave(fp, fp->fPatIdx-1, frameSize, status); + fp = StateSave(fp, fp->fPatIdx-1, status); } break; @@ -2541,6 +2755,7 @@ GC_Done: } if (U_FAILURE(status)) { + isMatch = FALSE; break; } } diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 0e45918ab4b..a08b1898ad5 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -1003,6 +1003,102 @@ public: int32_t destCapacity, UErrorCode &status); + /** + * Set a processing time limit for match operations with this Matcher. + * + * Some patterns, when matching certain strings, can run in exponential time. + * For practical purposes, the match operation may appear to be in an + * infinite loop. + * When a limit is set a match operation will fail with an error if the + * limit is exceeded. + *

+ * The units of the limit are steps of the match engine. + * Correspondence with actual processor time will depend on the speed + * of the processor and the details of the specific pattern, but will + * typically be on the order of milliseconds. + *

+ * By default, the matching time is not limited. + *

+ * + * @param limit The limit value, or 0 for no limit. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ + virtual void setTimeLimit(int32_t limit, UErrorCode &status); + + /** + * Get the time limit, if any, for match operations made with this Matcher. + * + * @return the maximum allowed time for a match, in units of processing steps. + * @draft ICU 4.0 + */ + virtual int32_t getTimeLimit() const; + + /** + * Set the amount of heap storage available for use by the match backtracking stack. + * The matcher is also reset, discarding any results from previous matches. + *

+ * ICU uses a backtracking regular expression engine, with the backtrack stack + * maintained on the heap. This function sets the limit to the amount of memory + * that can be used for this purpose. A backtracking stack overflow will + * result in an error from the match operation that caused it. + *

+ * A limit is desirable because a malicious or poorly designed pattern can use + * excessive memory, potentially crashing the process. A limit is enabled + * by default. + *

+ * @param limit The maximum size, in bytes, of the matching backtrack stack. + * A value of zero means no limit. + * The limit must be greater than or equal to zero. + * + * @param status A reference to a UErrorCode to receive any errors. + * + * @draft ICU 4.0 + */ + virtual void setStackLimit(int32_t limit, UErrorCode &status); + + /** + * Get the size of the heap storage available for use by the back tracking stack. + * + * @return the maximum backtracking stack size, in bytes, or zero if the + * stack size is unlimited. + * @draft ICU 4.0 + */ + virtual int32_t getStackLimit() const; + + + /** + * Set a callback function for use with this Matcher. + * During matching operations the function will be called periodically, + * giving the application the opportunity to terminate a long-running + * match. + * + * @param callback A pointer to the user-supplied callback function. + * @param context User context pointer. The value supplied at the + * time the callback function is set will be saved + * and passed to the callback each time that it is called. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ + virtual void setMatchCallback(URegexMatchCallback callback, + const void *context, + UErrorCode &status); + + + + /** + * Get the callback function for this Matcher. + * + * @param callback Out parameter, receives a pointer to the user-supplied + * callback function. + * @param context Out parameter, receives the user context pointer that + * was set when setMatchCallback() was called. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ + virtual void getMatchCallback(URegexMatchCallback &callback, + const void *&context, + UErrorCode &status); /** @@ -1030,10 +1126,13 @@ public: private: // Constructors and other object boilerplate are private. // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
- RegexMatcher(); // default constructor not implemented + RegexMatcher(); // default constructor not implemented RegexMatcher(const RegexPattern *pat); RegexMatcher(const RegexMatcher &other); RegexMatcher &operator =(const RegexMatcher &rhs); + void init(UErrorCode &status); // Common initialization + void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2. + friend class RegexPattern; friend class RegexCImpl; public: @@ -1050,8 +1149,8 @@ private: UBool isWordBoundary(int32_t pos); // perform Perl-like \b test UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test REStackFrame *resetStack(); - inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, - int32_t frameSize, UErrorCode &status); + inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status); + void IncrementTime(UErrorCode &status); const RegexPattern *fPattern; @@ -1059,6 +1158,7 @@ private: // should delete it when through. const UnicodeString *fInput; // The text being matched. Is never NULL. + int32_t fFrameSize; // The size of a frame in the backtrack stack. int32_t fRegionStart; // Start of the input region, default = 0. int32_t fRegionLimit; // End of input region, default to input.length. @@ -1101,9 +1201,25 @@ private: int32_t *fData; // Data area for use by the compiled pattern. int32_t fSmallData[8]; // Use this for data if it's enough. + int32_t fTimeLimit; // Max time (in arbitrary steps) to let the + // match engine run. Zero for unlimited. + + int32_t fTime; // Match time, accumulates while matching. + int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. + // Kept separately from fTime to keep as much + // code as possible out of the inline + // StateSave function. + + int32_t fStackLimit; // Maximum memory size to use for the backtrack + // stack, in bytes. Zero for unlimited. + + URegexMatchCallback fCallbackFn; // Pointer to match progress callback funct. 
+ // NULL if there is no callback. + const void *fCallbackContext; // User Context ptr for callback function. + UBool fTraceDebug; // Set true for debug tracing of match engine. - UErrorCode fDeferredStatus; // Save error state if that cannot be immediately + UErrorCode fDeferredStatus; // Save error state that cannot be immediately // reported, or that permanently disables this matcher. RuleBasedBreakIterator *fWordBreakItr; diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h index d24c836623e..fb808aacff1 100644 --- a/icu4c/source/i18n/unicode/uregex.h +++ b/icu4c/source/i18n/unicode/uregex.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2004-2007, International Business Machines +* Copyright (C) 2004-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: regex.h @@ -843,5 +843,145 @@ uregex_split( URegularExpression *regexp, + + /** + * Set a processing time limit for match operations with this URegularExpression. + * + * Some patterns, when matching certain strings, can run in exponential time. + * For practical purposes, the match operation may appear to be in an + * infinite loop. + * When a limit is set a match operation will fail with an error if the + * limit is exceeded. + *

+ * The units of the limit are steps of the match engine. + * Correspondence with actual processor time will depend on the speed + * of the processor and the details of the specific pattern, but will + * typically be on the order of milliseconds. + *

+ * By default, the matching time is not limited. + *

+ * + * @param regexp The compiled regular expression. + * @param limit The limit value, or 0 for no limit. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ +U_DRAFT void U_EXPORT2 +uregex_setTimeLimit(URegularExpression *regexp, + int32_t limit, + UErrorCode *status); + + /** + * Get the time limit for matches with this URegularExpression. + * A return value of zero indicates that there is no limit. + * + * @param regexp The compiled regular expression. + * @param status A reference to a UErrorCode to receive any errors. + * @return the maximum allowed time for a match, in units of processing steps. + * @draft ICU 4.0 + */ +U_DRAFT int32_t U_EXPORT2 +uregex_getTimeLimit(const URegularExpression *regexp, + UErrorCode *status); + + /** + * Set the amount of heap storage available for use by the match backtracking stack. + *

+ * ICU uses a backtracking regular expression engine, with the backtrack stack + * maintained on the heap. This function sets the limit to the amount of memory + * that can be used for this purpose. A backtracking stack overflow will + * result in an error from the match operation that caused it. + *

+ * A limit is desirable because a malicious or poorly designed pattern can use + * excessive memory, potentially crashing the process. A limit is enabled + * by default. + *

+ * @param regexp The compiled regular expression. + * @param limit The maximum size, in bytes, of the matching backtrack stack. + * A value of zero means no limit. + * The limit must be greater than or equal to zero. + * @param status A reference to a UErrorCode to receive any errors. + * + * @draft ICU 4.0 + */ +U_DRAFT void U_EXPORT2 +uregex_setStackLimit(URegularExpression *regexp, + int32_t limit, + UErrorCode *status); + + /** + * Get the size of the heap storage available for use by the back tracking stack. + * + * @return the maximum backtracking stack size, in bytes, or zero if the + * stack size is unlimited. + * @draft ICU 4.0 + */ +U_DRAFT int32_t U_EXPORT2 +uregex_getStackLimit(const URegularExpression *regexp, + UErrorCode *status); + + +/** + * Function pointer for a regular expression matching callback function. + * When set, a callback function will be called periodically during matching + * operations. If the callback function returns FALSE, the matching + * operation will be terminated early. + * + * Note: the callback function must not call other functions on this + * URegularExpression. + * + * @param context context pointer. The callback function will be invoked + * with the context specified at the time that + * uregex_setMatchCallback() is called. + * @param steps the accumulated processing time, in match steps, + * for this matching operation. + * @return TRUE to continue the matching operation. + * FALSE to terminate the matching operation. + * @draft ICU 4.0 + */ +typedef UBool (U_EXPORT2 *URegexMatchCallback) ( + const void *context, + int32_t steps); + +/** + * Set a callback function for this URegularExpression. + * During matching operations the function will be called periodically, + * giving the application the opportunity to terminate a long-running + * match. + * + * @param regexp The compiled regular expression. + * @param callback A pointer to the user-supplied callback function. + * @param context User context pointer.
The value supplied at the + * time the callback function is set will be saved + * and passed to the callback each time that it is called. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ +U_DRAFT void U_EXPORT2 +uregex_setMatchCallback(URegularExpression *regexp, + URegexMatchCallback callback, + const void *context, + UErrorCode *status); + + +/** + * Get the callback function for this URegularExpression. + * + * @param regexp The compiled regular expression. + * @param callback Out paramater, receives a pointer to the user-supplied + * callback function. + * @param context Out parameter, receives the user context pointer that + * was set when uregex_setMatchCallback() was called. + * @param status A reference to a UErrorCode to receive any errors. + * @draft ICU 4.0 + */ +U_DRAFT void U_EXPORT2 +uregex_getMatchCallback(const URegularExpression *regexp, + URegexMatchCallback *callback, + const void **context, + UErrorCode *status); + + + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ #endif /* UREGEX_H */ diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp index 7d86ce959d0..852a11e6282 100644 --- a/icu4c/source/i18n/uregex.cpp +++ b/icu4c/source/i18n/uregex.cpp @@ -74,7 +74,6 @@ static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool return FALSE; } if (re == NULL || re->fMagic != REXP_MAGIC) { - // U_ASSERT(FALSE); *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } @@ -630,6 +629,103 @@ uregex_requireEnd(const URegularExpression *regexp, } +//------------------------------------------------------------------------------ +// +// uregex_setTimeLimit +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setTimeLimit(URegularExpression *regexp, + int32_t limit, + UErrorCode *status) { + if (validateRE(regexp, status)) { + regexp->fMatcher->setTimeLimit(limit, *status); + } +} + + + 
+//------------------------------------------------------------------------------ +// +// uregex_getTimeLimit +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_getTimeLimit(const URegularExpression *regexp, + UErrorCode *status) { + int32_t retVal = 0; + if (validateRE(regexp, status)) { + retVal = regexp->fMatcher->getTimeLimit(); + } + return retVal; +} + + + +//------------------------------------------------------------------------------ +// +// uregex_setStackLimit +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setStackLimit(URegularExpression *regexp, + int32_t limit, + UErrorCode *status) { + if (validateRE(regexp, status)) { + regexp->fMatcher->setStackLimit(limit, *status); + } +} + + + +//------------------------------------------------------------------------------ +// +// uregex_getStackLimit +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_getStackLimit(const URegularExpression *regexp, + UErrorCode *status) { + int32_t retVal = 0; + if (validateRE(regexp, status)) { + retVal = regexp->fMatcher->getStackLimit(); + } + return retVal; +} + + +//------------------------------------------------------------------------------ +// +// uregex_setMatchCallback +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setMatchCallback(URegularExpression *regexp, + URegexMatchCallback callback, + const void *context, + UErrorCode *status) { + if (validateRE(regexp, status)) { + regexp->fMatcher->setMatchCallback(callback, context, *status); + } +} + + +//------------------------------------------------------------------------------ +// +// uregex_getMatchCallback +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 
+uregex_getMatchCallback(const URegularExpression *regexp, + URegexMatchCallback *callback, + const void **context, + UErrorCode *status) { + if (validateRE(regexp, status)) { + regexp->fMatcher->getMatchCallback(*callback, *context, *status); + } +} + + //------------------------------------------------------------------------------ // // uregex_replaceAll diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c index 532ff3c8ecf..c318e96d169 100644 --- a/icu4c/source/test/cintltst/reapits.c +++ b/icu4c/source/test/cintltst/reapits.c @@ -100,7 +100,31 @@ void addURegexTest(TestNode** root) addTest(root, &TestBug4315, "regex/TestBug4315"); } +/* + * Call back function and context struct used for testing + * regular expression user callbacks. This test is mostly the same as + * the corresponding C++ test in intltest. + */ +typedef struct callBackContext { + int32_t maxCalls; + int32_t numCalls; + int32_t lastSteps; +} callBackContext; +static UBool U_EXPORT2 U_CALLCONV +TestCallbackFn(const void *context, int32_t steps) { + callBackContext *info = (callBackContext *)context; + if (info->lastSteps+1 != steps) { + log_err("incorrect steps in callback. 
Expected %d, got %d\n", info->lastSteps+1, steps); + } + info->lastSteps = steps; + info->numCalls++; + return (info->numCalls < info->maxCalls); +} + +/* + * Regular Expression C API Tests + */ static void TestRegexCAPI(void) { UErrorCode status = U_ZERO_ERROR; URegularExpression *re; @@ -1144,8 +1168,72 @@ static void TestRegexCAPI(void) { uregex_close(re); } + /* + * set/getTimeLimit + */ + TEST_SETUP("abc$", "abcdef", 0); + TEST_ASSERT(uregex_getTimeLimit(re, &status) == 0); + uregex_setTimeLimit(re, 1000, &status); + TEST_ASSERT(uregex_getTimeLimit(re, &status) == 1000); + TEST_ASSERT_SUCCESS(status); + uregex_setTimeLimit(re, -1, &status); + TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); + status = U_ZERO_ERROR; + TEST_ASSERT(uregex_getTimeLimit(re, &status) == 1000); + TEST_TEARDOWN; + + /* + * set/get Stack Limit + */ + TEST_SETUP("abc$", "abcdef", 0); + TEST_ASSERT(uregex_getStackLimit(re, &status) == 8000000); + uregex_setStackLimit(re, 40000, &status); + TEST_ASSERT(uregex_getStackLimit(re, &status) == 40000); + TEST_ASSERT_SUCCESS(status); + uregex_setStackLimit(re, -1, &status); + TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); + status = U_ZERO_ERROR; + TEST_ASSERT(uregex_getStackLimit(re, &status) == 40000); + TEST_TEARDOWN; + + + /* + * Get/Set callback functions + * This test is copied from intltest regex/Callbacks + * The pattern and test data will run long enough to cause the callback + * to be invoked. The nested '+' operators give exponential time + * behavior with increasing string length. 
+ */ + TEST_SETUP("((.)+\\2)+x", "aaaaaaaaaaaaaaaaaaab", 0) + callBackContext cbInfo = {4, 0, 0}; + const void *pContext = &cbInfo; + URegexMatchCallback returnedFn = &TestCallbackFn; + + /* Getting the callback fn when it hasn't been set must return NULL */ + uregex_getMatchCallback(re, &returnedFn, &pContext, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(returnedFn == NULL); + TEST_ASSERT(pContext == NULL); + + /* Set thecallback and do a match. */ + /* The callback function should record that it has been called. */ + uregex_setMatchCallback(re, &TestCallbackFn, &cbInfo, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(cbInfo.numCalls == 0); + TEST_ASSERT(uregex_matches(re, -1, &status) == FALSE); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(cbInfo.numCalls > 0); + + /* Getting the callback should return the values that were set above. */ + uregex_getMatchCallback(re, &returnedFn, &pContext, &status); + TEST_ASSERT(returnedFn == &TestCallbackFn); + TEST_ASSERT(pContext == &cbInfo); + + TEST_TEARDOWN; } + + static void TestBug4315(void) { UErrorCode theICUError = U_ZERO_ERROR; URegularExpression *theRegEx; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 6ab96db0772..085432f0cd2 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2007, International Business Machines Corporation and + * Copyright (c) 2002-2008, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -66,7 +66,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch case 6: name = "PerlTests"; if (exec) PerlTests(); break; - + case 7: name = "Callbacks"; + if (exec) Callbacks(); + break; default: name = ""; break; //needed to end loop @@ -837,6 +839,90 @@ void RegexTest::API_Match() { } #endif + // + // Time Outs. + // Note: These tests will need to be changed when the regexp engine is + // able to detect and cut short the exponential time behavior on + // this type of match. + // + { + UErrorCode status = U_ZERO_ERROR; + // Enough 'a's in the string to cause the match to time out. + // (Each on additonal 'a' doubles the time) + UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); + RegexMatcher matcher("(a+)+b", testString, 0, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher.getTimeLimit() == 0); + matcher.setTimeLimit(100, status); + REGEX_ASSERT(matcher.getTimeLimit() == 100); + REGEX_ASSERT(matcher.lookingAt(status) == FALSE); + REGEX_ASSERT(status == U_REGEX_TIME_OUT); + } + { + UErrorCode status = U_ZERO_ERROR; + // Few enough 'a's to slip in under the time limit. + UnicodeString testString("aaaaaaaaaaaaaaaaaa"); + RegexMatcher matcher("(a+)+b", testString, 0, status); + REGEX_CHECK_STATUS; + matcher.setTimeLimit(100, status); + REGEX_ASSERT(matcher.lookingAt(status) == FALSE); + REGEX_CHECK_STATUS; + } + + // + // Stack Limits + // + { + UErrorCode status = U_ZERO_ERROR; + UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' + + // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations + // of the '+', and makes the stack frames larger. 
+ RegexMatcher matcher("(A)+A$", testString, 0, status); + + // With the default stack, this match should fail to run + REGEX_ASSERT(matcher.lookingAt(status) == FALSE); + REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); + + // With unlimited stack, it should run + status = U_ZERO_ERROR; + matcher.setStackLimit(0, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher.lookingAt(status) == TRUE); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher.getStackLimit() == 0); + + // With a limited stack, it the match should fail + status = U_ZERO_ERROR; + matcher.setStackLimit(10000, status); + REGEX_ASSERT(matcher.lookingAt(status) == FALSE); + REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); + REGEX_ASSERT(matcher.getStackLimit() == 10000); + } + + // A pattern that doesn't save state should work with + // a minimal sized stack + { + UErrorCode status = U_ZERO_ERROR; + UnicodeString testString = "abc"; + RegexMatcher matcher("abc", testString, 0, status); + REGEX_CHECK_STATUS; + matcher.setStackLimit(30, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher.matches(status) == TRUE); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher.getStackLimit() == 30); + + // Negative stack sizes should fail + status = U_ZERO_ERROR; + matcher.setStackLimit(1000, status); + REGEX_CHECK_STATUS; + matcher.setStackLimit(-1, status); + REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); + REGEX_ASSERT(matcher.getStackLimit() == 1000); + } + + } @@ -2299,6 +2385,98 @@ void RegexTest::PerlTests() { } +// +// Callbacks() Test the callback function. +// When set, callbacks occur periodically during matching operations, +// giving the application code the ability to abort the operation +// before it's normal completion. 
+// + +struct callBackContext { + RegexTest *test; + int32_t maxCalls; + int32_t numCalls; + int32_t lastSteps; + void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; +}; + +U_CDECL_BEGIN +static UBool U_CALLCONV +testCallBackFn(const void *context, int32_t steps) { + callBackContext *info = (callBackContext *)context; + if (info->lastSteps+1 != steps) { + info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); + } + info->lastSteps = steps; + info->numCalls++; + return (info->numCalls < info->maxCalls); +} +U_CDECL_END + +void RegexTest::Callbacks() { + { + // Getter returns NULLs if no callback has been set + + // The variables that the getter will fill in. + // Init to non-null values so that the action of the getter can be seen. + const void *returnedContext = &returnedContext; + URegexMatchCallback returnedFn = &testCallBackFn; + + UErrorCode status = U_ZERO_ERROR; + RegexMatcher matcher("x", 0, status); + REGEX_CHECK_STATUS; + matcher.getMatchCallback(returnedFn, returnedContext, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(returnedFn == NULL); + REGEX_ASSERT(returnedContext == NULL); + } + + { + // Set and Get work + callBackContext cbInfo = {this, 0, 0, 0}; + const void *returnedContext; + URegexMatchCallback returnedFn; + UErrorCode status = U_ZERO_ERROR; + RegexMatcher matcher("((.)+\\2)+x", 0, status); // A pattern that can run long. 
+ REGEX_CHECK_STATUS; + matcher.setMatchCallback(testCallBackFn, &cbInfo, status); + REGEX_CHECK_STATUS; + matcher.getMatchCallback(returnedFn, returnedContext, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(returnedFn == &testCallBackFn); + REGEX_ASSERT(returnedContext == &cbInfo); + + // A short-running match shouldn't invoke the callback + status = U_ZERO_ERROR; + cbInfo.reset(1); + UnicodeString s = "xxx"; + matcher.reset(s); + REGEX_ASSERT(matcher.matches(status)); + REGEX_CHECK_STATUS; + REGEX_ASSERT(cbInfo.numCalls == 0); + + // A medium-length match that runs long enough to invoke the + // callback, but not so long that the callback aborts it. + status = U_ZERO_ERROR; + cbInfo.reset(4); + s = "aaaaaaaaaaaaaaaaaaab"; + matcher.reset(s); + REGEX_ASSERT(matcher.matches(status)==FALSE); + REGEX_CHECK_STATUS; + REGEX_ASSERT(cbInfo.numCalls > 0); + + // A longer running match that the callback function will abort. + status = U_ZERO_ERROR; + cbInfo.reset(4); + s = "aaaaaaaaaaaaaaaaaaaaaaab"; + matcher.reset(s); + REGEX_ASSERT(matcher.matches(status)==FALSE); + REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); + REGEX_ASSERT(cbInfo.numCalls == 4); + } + + +} #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h index 92472783df5..12b7e634662 100644 --- a/icu4c/source/test/intltest/regextst.h +++ b/icu4c/source/test/intltest/regextst.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2007, International Business Machines Corporation and + * Copyright (c) 2002-2008, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -30,6 +30,7 @@ public: virtual void Extended(); virtual void Errors(); virtual void PerlTests(); + virtual void Callbacks(); // The following functions are internal to the regexp tests. 
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line);