ICU-7666 Regex find progress callback (from branch to trunk)

X-SVN-Rev: 28201
This commit is contained in:
Michael Grady 2010-06-15 20:19:10 +00:00
parent 11e525a1a7
commit 8b6ce73317
6 changed files with 335 additions and 12 deletions

View file

@ -216,6 +216,8 @@ void RegexMatcher::init(UErrorCode &status) {
fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
fCallbackFn = NULL;
fCallbackContext = NULL;
fFindProgressCallbackFn = NULL;
fFindProgressCallbackContext = NULL;
fTraceDebug = FALSE;
fDeferredStatus = status;
fData = fSmallData;
@ -548,7 +550,6 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
}
//--------------------------------------------------------------------------------
//
// find()
@ -641,6 +642,8 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
U_ASSERT(FALSE);
@ -687,6 +690,8 @@ UBool RegexMatcher::find() {
return FALSE;
}
startPos = pos;
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
U_ASSERT(FALSE);
@ -718,6 +723,8 @@ UBool RegexMatcher::find() {
return FALSE;
}
startPos = pos;
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
U_ASSERT(FALSE);
@ -764,6 +771,8 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
} else {
for (;;) {
@ -792,6 +801,8 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
}
@ -925,6 +936,8 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
U_ASSERT(FALSE);
@ -964,6 +977,8 @@ UBool RegexMatcher::findUsingChunk() {
fHitEnd = TRUE;
return FALSE;
}
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
U_ASSERT(FALSE);
@ -991,6 +1006,8 @@ UBool RegexMatcher::findUsingChunk() {
fHitEnd = TRUE;
return FALSE;
}
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
U_ASSERT(FALSE);
@ -1030,6 +1047,8 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
} else {
for (;;) {
@ -1056,6 +1075,8 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
return FALSE;
}
}
}
@ -1889,7 +1910,6 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
return *this;
}*/
RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
@ -2330,6 +2350,38 @@ void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
}
//--------------------------------------------------------------------------------
//
//    setFindProgressCallback
//
//        Remember the user-supplied find-progress callback and its context
//        pointer on this matcher.  A NULL callback means "no callback":
//        ReportFindProgress() only invokes the function when it is non-NULL.
//
//        Nothing is stored if status already holds a failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
                                           const void *context,
                                           UErrorCode &status) {
    if (!U_FAILURE(status)) {
        // The two fields are independent; they are always updated as a pair.
        fFindProgressCallbackContext = context;
        fFindProgressCallbackFn      = callback;
    }
}
//--------------------------------------------------------------------------------
//
//    getFindProgressCallback
//
//        Hand back, through the two reference out-parameters, the find-progress
//        callback function and context pointer currently stored on this matcher.
//        Both are NULL when no callback has been installed (see init()).
//
//        The out-parameters are left untouched if status already holds a
//        failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
                                           const void *&context,
                                           UErrorCode &status) {
    if (!U_FAILURE(status)) {
        context  = fFindProgressCallbackContext;
        callback = fFindProgressCallbackFn;
    }
}
//================================================================================
//
// Code following this point in this file is the internal
@ -2517,6 +2569,29 @@ void RegexMatcher::IncrementTime(UErrorCode &status) {
}
}
//--------------------------------------------------------------------------------
//
//    ReportFindProgress
//
//        Called from the find() loops each time the search advances to a new
//        start position in the target string.  If a find-progress callback is
//        installed it is invoked with the stored context and matchIndex.
//
//        Returns TRUE  when no callback is installed, or the callback asked
//                      to continue.
//                FALSE when the callback returned FALSE, i.e. the caller
//                      should abandon the find operation.
//
//        NOTE:
//           When the callback asks to stop, status is currently set to
//           U_ZERO_ERROR (U_REGEX_STOPPED_BY_CALLER is commented out), so the
//           interruption is signalled to the caller only through the FALSE
//           return value, not through a failing status.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
    // No callback installed: nothing to report, keep searching.
    if (fFindProgressCallbackFn == NULL) {
        return TRUE;
    }

    const UBool keepGoing = (*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex);
    if (keepGoing != FALSE) {
        return TRUE;
    }

    // The user asked for the find operation to be terminated.
    status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
    return FALSE;
}
//--------------------------------------------------------------------------------
//
// StateSave

View file

@ -1548,6 +1548,40 @@ public:
UErrorCode &status);
/**
* Set a progress callback function for use with find operations on this Matcher.
* During find operations, the callback will be invoked after each return from a
* match attempt, giving the application the opportunity to terminate a long-running
* find operation.
*
* Passing a NULL callback leaves the matcher with no find progress callback.
* If status indicates a failure on entry, the matcher is left unchanged.
*
* @param callback A pointer to the user-supplied callback function.
* @param context User context pointer. The value supplied at the
* time the callback function is set will be saved
* and passed to the callback each time that it is called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.6
*/
virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
const void *context,
UErrorCode &status);
/**
* Get the find progress callback function for this Matcher.
*
* If status indicates a failure on entry, the out parameters are not modified.
*
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function, or NULL if none has been set.
* @param context Out parameter, receives the user context pointer that
* was set when setFindProgressCallback() was called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.6
*/
virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
const void *&context,
UErrorCode &status);
/**
* setTrace Debug function, enable/disable tracing of the matching engine.
* For internal ICU development use only. DO NOT USE!!!!
@ -1598,6 +1632,7 @@ private:
REStackFrame *resetStack();
inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
void IncrementTime(UErrorCode &status);
UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
@ -1673,6 +1708,11 @@ private:
// NULL if there is no callback.
const void *fCallbackContext; // User Context ptr for callback function.
URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to match progress callback funct.
// NULL if there is no callback.
const void *fFindProgressCallbackContext; // User Context ptr for callback function.
UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
UBool fTraceDebug; // Set true for debug tracing of match engine.

View file

@ -1256,6 +1256,67 @@ uregex_getMatchCallback(const URegularExpression *regexp,
UErrorCode *status);
/**
* Function pointer for a regular expression find/findNext callback function.
* When set, a callback function will be called during a find operation after each
* attempt at a match. If the callback function returns FALSE, the find
* operation will be terminated early.
*
* Note: the callback function must not call other functions on this
* URegularExpression.
*
* @param context context pointer. The callback function will be invoked
* with the context specified at the time that
* uregex_setFindProgressCallback() is called.
* @param matchIndex the next index at which a match attempt will be made for this
* find operation. If this callback interrupts the search, this is the
* index at which a find/findNext operation may be re-initiated.
* @return TRUE to continue the matching operation.
* FALSE to terminate the matching operation.
* @draft ICU 4.6
*/
U_CDECL_BEGIN
typedef UBool U_CALLCONV URegexFindProgressCallback (
const void *context,
int64_t matchIndex);
U_CDECL_END
/* NOTE(review): documented as @draft below but exported with U_STABLE; confirm whether U_DRAFT was intended. */
/**
* Set the find progress callback function for this URegularExpression.
* During find operations, this callback will be invoked after each return from a
* match attempt, specifying the next index at which a match operation is about to be attempted,
* giving the application the opportunity to terminate a long-running find operation.
*
* @param regexp The compiled regular expression.
* @param callback A pointer to the user-supplied callback function.
* @param context User context pointer. The value supplied at the
* time the callback function is set will be saved
* and passed to the callback each time that it is called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.6
*/
U_STABLE void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression *regexp,
URegexFindProgressCallback *callback,
const void *context,
UErrorCode *status);
/* NOTE(review): documented as @draft below but exported with U_STABLE; confirm whether U_DRAFT was intended. */
/**
* Get the find progress callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback Out parameter, receives a pointer to the user-supplied
* callback function.
* @param context Out parameter, receives the user context pointer that
* was set when uregex_setFindProgressCallback() was called.
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.6
*/
U_STABLE void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression *regexp,
URegexFindProgressCallback **callback,
const void **context,
UErrorCode *status);
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
#endif /* UREGEX_H */

View file

@ -997,6 +997,40 @@ uregex_getMatchCallback(const URegularExpression *regexp2,
}
//------------------------------------------------------------------------------
//
//    uregex_setFindProgressCallback
//
//        C API wrapper: once validateRE() accepts the handle and status,
//        forward the callback and context to the underlying RegexMatcher.
//        Otherwise do nothing.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression *regexp2,
                               URegexFindProgressCallback *callback,
                               const void *context,
                               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, status)) {
        return;
    }
    re->fMatcher->setFindProgressCallback(callback, context, *status);
}
//------------------------------------------------------------------------------
//
//    uregex_getFindProgressCallback
//
//        C API wrapper: once validateRE() accepts the handle and status,
//        ask the underlying RegexMatcher for its find progress callback and
//        context, storing them through the caller's out pointers.
//        Otherwise the out pointers are not written.
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression *regexp2,
                               URegexFindProgressCallback **callback,
                               const void **context,
                               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, status)) {
        return;
    }
    re->fMatcher->getFindProgressCallback(*callback, *context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_replaceAll

View file

@ -77,30 +77,33 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 7: name = "Callbacks";
if (exec) Callbacks();
break;
case 8: name = "Bug 6149";
case 8: name = "FindProgressCallbacks";
if (exec) FindProgressCallbacks();
break;
case 9: name = "Bug 6149";
if (exec) Bug6149();
break;
case 9: name = "UTextBasic";
case 10: name = "UTextBasic";
if (exec) UTextBasic();
break;
case 10: name = "API_Match_UTF8";
case 11: name = "API_Match_UTF8";
if (exec) API_Match_UTF8();
break;
case 11: name = "API_Replace_UTF8";
case 12: name = "API_Replace_UTF8";
if (exec) API_Replace_UTF8();
break;
case 12: name = "API_Pattern_UTF8";
case 13: name = "API_Pattern_UTF8";
if (exec) API_Pattern_UTF8();
break;
case 13: name = "PerlTestsUTF8";
case 14: name = "PerlTestsUTF8";
if (exec) PerlTestsUTF8();
break;
case 14: name = "PreAllocatedUTextCAPI";
case 15: name = "PreAllocatedUTextCAPI";
if (exec) PreAllocatedUTextCAPI();
break;
case 15: name = "Bug 7651";
if (exec) Bug7651();
break;
case 16: name = "Bug 7651";
if (exec) Bug7651();
break;
default: name = "";
break; //needed to end loop
@ -4411,6 +4414,114 @@ void RegexTest::Callbacks() {
}
//
// FindProgressCallbacks() Test the find "progress" callback function.
// When set, the find progress callback will be invoked during a find operation
// after each return from a match attempt, giving the application the opportunity
// to terminate a long-running find operation before its normal completion.
//
// Bookkeeping shared between FindProgressCallbacks() and testProgressCallBackFn().
// Kept as a plain aggregate (member order matters) because the test
// brace-initializes it.
struct progressCallBackContext {
    RegexTest *test;        // The test object that created this context.
    int64_t    lastIndex;   // matchIndex passed to the most recent callback.
    int32_t    maxCalls;    // Callback returns FALSE once numCalls reaches this.
    int32_t    numCalls;    // Number of callback invocations since the last reset().

    // Start a fresh run: install a new call limit and clear the counters.
    void reset(int32_t max) {
        lastIndex = 0;
        numCalls  = 0;
        maxCalls  = max;
    }
};
U_CDECL_BEGIN
// Find progress callback used by the test.  Counts the invocation, records the
// reported index, and asks the matcher to continue only while the number of
// calls is still below the limit stored in the context.
static UBool U_CALLCONV
testProgressCallBackFn(const void *context, int64_t matchIndex) {
    progressCallBackContext *cbState = (progressCallBackContext *)context;

    cbState->lastIndex = matchIndex;
    cbState->numCalls += 1;
    // Debug aid:
    // cbState->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, cbState->numCalls);

    if (cbState->numCalls < cbState->maxCalls) {
        return TRUE;
    }
    return FALSE;
}
U_CDECL_END
// Exercise RegexMatcher's find progress callback:
//   1. the getter reports NULL/NULL when nothing has been set;
//   2. set followed by get round-trips the function and context;
//   3. find() invokes the callback as it advances, and stops when the
//      callback returns FALSE.
void RegexTest::FindProgressCallbacks() {
{
// Getter returns NULLs if no callback has been set
// The variables that the getter will fill in.
// Init to non-null values so that the action of the getter can be seen.
const void *returnedContext = &returnedContext;
URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
UErrorCode status = U_ZERO_ERROR;
RegexMatcher matcher("x", 0, status);
REGEX_CHECK_STATUS;
matcher.getFindProgressCallback(returnedFn, returnedContext, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(returnedFn == NULL);
REGEX_ASSERT(returnedContext == NULL);
}
{
// Set and Get work
// cbInfo is brace-initialized as {test, lastIndex, maxCalls, numCalls}.
progressCallBackContext cbInfo = {this, 0, 0, 0};
const void *returnedContext;
URegexFindProgressCallback *returnedFn;
UErrorCode status = U_ZERO_ERROR;
RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
REGEX_CHECK_STATUS;
matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
REGEX_CHECK_STATUS;
matcher.getFindProgressCallback(returnedFn, returnedContext, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(returnedFn == testProgressCallBackFn);
REGEX_ASSERT(returnedContext == &cbInfo);
// A short-running match should NOT invoke the callback:
// the find is expected to succeed with numCalls still at zero.
status = U_ZERO_ERROR;
cbInfo.reset(100);
UnicodeString s = "abxxx";
matcher.reset(s);
#if 0
matcher.setTrace(TRUE);
#endif
REGEX_ASSERT(matcher.find(0, status));
REGEX_CHECK_STATUS;
REGEX_ASSERT(cbInfo.numCalls == 0);
// A medium running match that causes matcher.find() to invoke our callback for each index.
// The input contains no 'x', so the find is expected to fail.
status = U_ZERO_ERROR;
s = "aaaaaaaaaaaaaaaaaaab";
cbInfo.reset(s.length()); // Call limit equal to the input length; the callback returns FALSE once numCalls reaches it
matcher.reset(s);
REGEX_ASSERT(matcher.find(0, status)==FALSE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
// A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
// The callback returns FALSE on call number (s1.length() - 5), so exactly that many calls are expected.
// NOTE(review): the status check after the interrupted find is expected to pass - presumably because
// the interruption is not reported as a failing status; confirm against REGEX_CHECK_STATUS.
status = U_ZERO_ERROR;
UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
matcher.reset(s1);
REGEX_ASSERT(matcher.find(0, status)==FALSE);
REGEX_CHECK_STATUS;
REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
#if 0
// Disabled: resuming a find from the index reported at the point of interruption.
// Now a match that will succeed, but after an interruption
status = U_ZERO_ERROR;
UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
matcher.reset(s2);
REGEX_ASSERT(matcher.find(0, status)==FALSE);
REGEX_CHECK_STATUS;
// Now retry the match from where left off
cbInfo.maxCalls = 100; // No callback limit
REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
REGEX_CHECK_STATUS;
#endif
}
}
//---------------------------------------------------------------------------
//
// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
@ -4619,6 +4730,7 @@ void RegexTest::PreAllocatedUTextCAPI () {
void RegexTest::Bug7651() {
UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
// The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
// It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
UnicodeString s("#ff @abcd This is test");
RegexPattern *REPattern = NULL;

View file

@ -34,6 +34,7 @@ public:
virtual void PerlTests();
virtual void Bug6149();
virtual void Callbacks();
virtual void FindProgressCallbacks();
virtual void UTextBasic();
virtual void API_Match_UTF8();
virtual void API_Pattern_UTF8();