ICU-7813 64bit regex API, 7675: UText-based Regex to use native indexes, 7764: Improved UText-regex API error handling, 7855: UText regex group API returns shallow clone, 7851: Set region and start position, 7763: Inline regex progress callback function.

X-SVN-Rev: 28647
This commit is contained in:
Michael Grady 2010-09-18 03:07:17 +00:00
parent efa8bfba9e
commit 751473d25a
7 changed files with 681 additions and 319 deletions

View file

@ -471,7 +471,7 @@ UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
utext_openUnicodeString(&resultText, &dest, &status);
if (U_SUCCESS(status)) {
appendTail(&resultText);
appendTail(&resultText, status);
utext_close(&resultText);
}
@ -481,9 +481,25 @@ UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
//
// appendTail, UText mode
//
UText *RegexMatcher::appendTail(UText *dest) {
UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
UBool bailOut = FALSE;
if (U_FAILURE(status)) {
bailOut = TRUE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
bailOut = TRUE;
}
if (bailOut) {
// dest must not be NULL
if (dest) {
utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
return dest;
}
}
if (fInputLength > fAppendPosition) {
UErrorCode status = U_ZERO_ERROR;
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
int64_t destLen = utext_nativeLength(dest);
utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
@ -522,9 +538,11 @@ int32_t RegexMatcher::end(UErrorCode &err) const {
return end(0, err);
}
// end64(err): 64-bit end index of the whole match.
//   Forwards to end64(group, err) with group 0, i.e. the entire match.
int64_t RegexMatcher::end64(UErrorCode &err) const {
    const int32_t wholeMatchGroup = 0;
    return end64(wholeMatchGroup, err);
}
int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return -1;
}
@ -548,13 +566,11 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
e = fFrame->fExtra[groupOffset + 1];
}
if (e == -1 || UTEXT_USES_U16(fInputText)) {
return (int32_t)e;
} else {
// !!!: Would like a better way to do this!
UErrorCode status = U_ZERO_ERROR;
return utext_extract(fInputText, 0, e, NULL, 0, &status);
}
return e;
}
// end(group, err): 32-bit flavor of end64(group, err).
//   The 64-bit result is narrowed to int32_t before being returned.
int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    const int64_t wideEnd = end64(group, err);
    return static_cast<int32_t>(wideEnd);
}
@ -650,7 +666,7 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
U_ASSERT(FALSE);
@ -698,7 +714,7 @@ UBool RegexMatcher::find() {
return FALSE;
}
startPos = pos;
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -731,7 +747,7 @@ UBool RegexMatcher::find() {
return FALSE;
}
startPos = pos;
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -779,7 +795,7 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
} else {
@ -809,7 +825,7 @@ UBool RegexMatcher::find() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -825,7 +841,7 @@ UBool RegexMatcher::find() {
UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
@ -840,25 +856,8 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
return FALSE;
}
UBool couldFindStart = TRUE;
int64_t nativeStart;
if (UTEXT_USES_U16(fInputText)) {
nativeStart = start;
} else {
UTEXT_SETNATIVEINDEX(fInputText, 0);
int32_t i = 0;
while (i < start) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindStart = FALSE;
break;
}
}
nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
}
if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
int64_t nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@ -944,7 +943,7 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
U_ASSERT(FALSE);
@ -985,7 +984,7 @@ UBool RegexMatcher::findUsingChunk() {
fHitEnd = TRUE;
return FALSE;
}
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -1014,7 +1013,7 @@ UBool RegexMatcher::findUsingChunk() {
fHitEnd = TRUE;
return FALSE;
}
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -1055,7 +1054,7 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
} else {
@ -1083,7 +1082,7 @@ UBool RegexMatcher::findUsingChunk() {
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (ReportFindProgress(startPos, fDeferredStatus) == FALSE)
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
return FALSE;
}
}
@ -1108,11 +1107,59 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
return group(0, status);
}
UText *RegexMatcher::group(UText *dest, MatcherDestIsUTextFlag /*flag*/, UErrorCode &status) const {
return group(0, dest, status);
// Return immutable shallow clone
//   Convenience overload for the entire match: forwards to the
//   group-number overload with group 0.
UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
    const int32_t wholeMatchGroup = 0;
    UText *clone = group(wholeMatchGroup, dest, group_len, status);
    return clone;
}
// Return immutable shallow clone
//
//   groupNum   Capture group to locate; 0 is the entire match.
//   dest       UText to clone into, or NULL to have utext_clone() supply one.
//   group_len  Receives (end - start) of the group in native index units.
//              It is 0 on any error and when the group did not take part
//              in the match.
//   status     If already a failure on entry, dest is returned untouched.
//              Otherwise set to the first problem found, in this order:
//              the matcher's deferred error, U_REGEX_INVALID_STATE when there
//              is no current match, U_INDEX_OUTOFBOUNDS_ERROR for a bad
//              group number.
//
//   On success the returned UText is a shallow (deep = FALSE), read-only
//   clone of the input text whose native index is positioned at the start
//   of the requested group.
UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
    group_len = 0;
    if (U_FAILURE(status)) {
        return dest;
    }

    // Report only the FIRST failure. Independent checks would let a later
    // one overwrite an earlier error code, hiding e.g. a deferred error
    // behind U_REGEX_INVALID_STATE.
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
    } else if (fMatch == FALSE) {
        status = U_REGEX_INVALID_STATE;
    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
    }
    if (U_FAILURE(status)) {
        return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
    }

    int64_t s, e;
    if (groupNum == 0) {
        s = fMatchStart;
        e = fMatchEnd;
    } else {
        // fGroupMap is indexed from 0 while capture groups are numbered from 1.
        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
        U_ASSERT(groupOffset < fPattern->fFrameSize);
        U_ASSERT(groupOffset >= 0);
        s = fFrame->fExtra[groupOffset];
        e = fFrame->fExtra[groupOffset+1];
    }

    if (s < 0) {
        // A capture group wasn't part of the match.
        // group_len stays 0 and the clone's index is left wherever utext_clone() puts it.
        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
    }
    U_ASSERT(s <= e);
    group_len = e - s;

    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
    if (dest) {
        UTEXT_SETNATIVEINDEX(dest, s);
    }
    return dest;
}
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
UnicodeString result;
@ -1127,6 +1174,9 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
}
// Return deep (mutable) clone
// Technology Preview (as an API), but note that the UnicodeString API is implemented
// using this function.
UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
UBool bailOut = FALSE;
if (U_FAILURE(status)) {
@ -1372,8 +1422,25 @@ UText *RegexMatcher::inputText() const {
// getInput() -- like inputText(), but makes a clone or copies into another UText
//
//--------------------------------------------------------------------------------
UText *RegexMatcher::getInput (UText *dest) const {
UErrorCode status = U_ZERO_ERROR; // ignored
UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
UBool bailOut = FALSE;
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
bailOut = TRUE;
}
if (bailOut) {
if (dest) {
utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
return dest;
} else {
return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
}
}
if (dest) {
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
@ -1462,7 +1529,7 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
}
UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
@ -1485,24 +1552,8 @@ UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
}
int64_t nativeStart;
UBool couldFindStart = TRUE;
if (UTEXT_USES_U16(fInputText)) {
nativeStart = start;
} else {
UTEXT_SETNATIVEINDEX(fInputText, 0);
int32_t i = 0;
while (i < start) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindStart = FALSE;
break;
}
}
nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
}
if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@ -1550,7 +1601,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
}
UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
@ -1573,24 +1624,8 @@ UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
}
int64_t nativeStart;
UBool couldFindStart = TRUE;
if (UTEXT_USES_U16(fInputText)) {
nativeStart = start;
} else {
UTEXT_SETNATIVEINDEX(fInputText, 0);
int32_t i = 0;
while (i < start) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindStart = FALSE;
break;
}
}
nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
}
if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@ -1621,65 +1656,38 @@ const RegexPattern &RegexMatcher::pattern() const {
// region
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (start>limit || start<0 || limit<0) {
if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
int64_t nativeStart;
int32_t i = 0;
UBool couldFindStart = TRUE;
if (UTEXT_USES_U16(fInputText)) {
nativeStart = start;
couldFindStart = (nativeStart <= fInputLength);
} else {
UTEXT_SETNATIVEINDEX(fInputText, 0);
while (i < start) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindStart = FALSE;
break;
}
}
nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
}
int64_t nativeLimit = nativeStart;
if (!couldFindStart) {
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {
UBool couldFindLimit = TRUE;
if (UTEXT_USES_U16(fInputText)) {
nativeLimit = limit;
couldFindLimit = (nativeLimit <= fInputLength);
} else {
while (i < limit) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindLimit = FALSE;
break;
}
}
nativeLimit = UTEXT_GETNATIVEINDEX(fInputText);
}
if (!couldFindLimit) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
int64_t nativeStart = regionStart;
int64_t nativeLimit = regionLimit;
if (nativeStart > fInputLength || nativeLimit > fInputLength) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
this->reset();
if (startIndex == -1)
this->reset();
else
resetPreserveRegion();
fRegionStart = nativeStart;
fRegionLimit = nativeLimit;
fActiveStart = nativeStart;
fActiveLimit = nativeLimit;
if (startIndex != -1) {
if (startIndex < fActiveStart || startIndex > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
}
fMatchEnd = startIndex;
}
if (!fTransparentBounds) {
fLookStart = nativeStart;
fLookLimit = nativeLimit;
@ -1691,7 +1699,9 @@ RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &sta
return *this;
}
// region(start, limit, status)
//   Forwards to the four-argument overload, passing -1 as the start index.
RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
    const int64_t noStartIndex = -1;
    return region(start, limit, noStartIndex, status);
}
//--------------------------------------------------------------------------------
//
@ -1699,15 +1709,12 @@ RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &sta
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::regionEnd() const {
if (UTEXT_USES_U16(fInputText)) {
return (int32_t)fRegionLimit;
} else {
// !!!: Would like a better way to do this!
UErrorCode status = U_ZERO_ERROR;
return utext_extract(fInputText, 0, fRegionLimit, NULL, 0, &status);
}
return (int32_t)fRegionLimit;
}
// regionEnd64(): the stored region limit as a 64-bit value.
int64_t RegexMatcher::regionEnd64() const {
    const int64_t limit = fRegionLimit;
    return limit;
}
//--------------------------------------------------------------------------------
//
@ -1715,13 +1722,11 @@ int32_t RegexMatcher::regionEnd() const {
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::regionStart() const {
if (UTEXT_USES_U16(fInputText)) {
return (int32_t)fRegionStart;
} else {
// !!!: Would like a better way to do this!
UErrorCode status = U_ZERO_ERROR;
return utext_extract(fInputText, 0, fRegionStart, NULL, 0, &status);
}
return (int32_t)fRegionStart;
}
// regionStart64(): the stored region start as a 64-bit value.
int64_t RegexMatcher::regionStart64() const {
    const int64_t begin = fRegionStart;
    return begin;
}
@ -1779,7 +1784,7 @@ UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
break;
}
}
appendTail(dest);
appendTail(dest, status);
}
return dest;
@ -1821,7 +1826,7 @@ UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
reset();
if (!find()) {
return getInput(dest);
return getInput(dest, status);
}
if (dest == NULL) {
@ -1834,7 +1839,7 @@ UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
}
appendReplacement(dest, replacement, status);
appendTail(dest);
appendTail(dest, status);
return dest;
}
@ -1936,7 +1941,7 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
return *this;
}*/
RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
@ -1948,24 +1953,8 @@ RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
}
int64_t nativePos;
UBool couldFindStart = TRUE;
if (UTEXT_USES_U16(fInputText)) {
nativePos = position;
} else {
UTEXT_SETNATIVEINDEX(fInputText, 0);
int32_t i = 0;
while (i < position) {
UChar32 c = UTEXT_NEXT32(fInputText);
if (c != U_SENTINEL) {
i += U16_LENGTH(c);
} else {
couldFindStart = FALSE;
break;
}
}
nativePos = UTEXT_GETNATIVEINDEX(fInputText);
}
if (!couldFindStart || nativePos < fActiveStart || nativePos >= fActiveLimit) {
nativePos = position;
if (nativePos < fActiveStart || nativePos >= fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return *this;
}
@ -2224,15 +2213,17 @@ int32_t RegexMatcher::start(UErrorCode &status) const {
return start(0, status);
}
// start64(status): 64-bit start index of the whole match.
//   Forwards to start64(group, status) with group 0, i.e. the entire match.
int64_t RegexMatcher::start64(UErrorCode &status) const {
    const int32_t wholeMatchGroup = 0;
    return start64(wholeMatchGroup, status);
}
//--------------------------------------------------------------------------------
//
// start(int32_t group, UErrorCode &status)
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
if (U_FAILURE(status)) {
return -1;
}
@ -2258,16 +2249,13 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
s = fFrame->fExtra[groupOffset];
}
if (s == -1 || UTEXT_USES_U16(fInputText)) {
return (int32_t)s;
} else {
// !!!: Would like a better way to do this!
UErrorCode status = U_ZERO_ERROR;
return utext_extract(fInputText, 0, s, NULL, 0, &status);
}
return s;
}
// start(group, status): 32-bit flavor of start64(group, status).
//   The 64-bit result is narrowed to int32_t before being returned.
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
    const int64_t wideStart = start64(group, status);
    return static_cast<int32_t>(wideStart);
}
//--------------------------------------------------------------------------------
//

View file

@ -581,11 +581,13 @@ UnicodeString RegexPattern::pattern() const {
// patternText
//
//---------------------------------------------------------------------
UText *RegexPattern::patternText() const {
UText *RegexPattern::patternText(UErrorCode &status) const {
if (U_FAILURE(status)) {return NULL;}
status = U_ZERO_ERROR;
if (fPattern != NULL) {
return fPattern;
} else {
UErrorCode status = U_ZERO_ERROR;
RegexStaticSets::initGlobals(&status);
return RegexStaticSets::gStaticSets->fEmptyText;
}

View file

@ -214,7 +214,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
UParseError &pe,
@ -274,7 +274,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
uint32_t flags,
@ -331,7 +331,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
static RegexPattern * U_EXPORT2 compile( UText *regex,
uint32_t flags,
@ -368,7 +368,7 @@ public:
/**
* Flag to disambiguate RegexPattern::matcher signature
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
enum PatternIsUTextFlag { PATTERN_IS_UTEXT };
@ -389,7 +389,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return A RegexMatcher object for this pattern and input.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual RegexMatcher *matcher(UText *input,
PatternIsUTextFlag flag,
@ -460,7 +460,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return True if the regular expression exactly matches the full input string.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
static UBool U_EXPORT2 matches(UText *regex,
UText *input,
@ -487,9 +487,9 @@ public:
* UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
* object.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.6
*/
virtual UText *patternText() const;
virtual UText *patternText(UErrorCode &status) const;
/**
@ -546,7 +546,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual int32_t split(UText *input,
UText *dest[],
@ -677,7 +677,7 @@ public:
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
@ -724,7 +724,7 @@ public:
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
RegexMatcher(UText *regexp, UText *input,
uint32_t flags, UErrorCode &status);
@ -770,12 +770,12 @@ public:
* at the specified startIndex, and extending to the end of the input.
* The input region is reset to include the entire input string.
* A successful match must extend to the end of the input.
* @param startIndex The input string index at which to begin matching.
* @param startIndex The input string (native) index at which to begin matching.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @stable ICU 2.8
*/
virtual UBool matches(int32_t startIndex, UErrorCode &status);
virtual UBool matches(int64_t startIndex, UErrorCode &status);
/**
@ -802,12 +802,12 @@ public:
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param startIndex The input string index at which to begin matching.
* @param startIndex The input string (native) index at which to begin matching.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match.
* @stable ICU 2.8
*/
virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
/**
@ -829,12 +829,12 @@ public:
* Resets this RegexMatcher and then attempts to find the next substring of the
* input string that matches the pattern, starting at the specified index.
*
* @param start the position in the input string to begin the search
* @param start The (native) index in the input string to begin the search.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @stable ICU 2.4
*/
virtual UBool find(int32_t start, UErrorCode &status);
virtual UBool find(int64_t start, UErrorCode &status);
/**
@ -849,30 +849,6 @@ public:
virtual UnicodeString group(UErrorCode &status) const;
/**
* Flag to disambiguate RegexMatcher::group signature
* @internal ICU 4.4 technology preview
*/
enum MatcherDestIsUTextFlag { MATCHER_DEST_IS_UTEXT };
/**
* Returns a string containing the text matched by the previous match.
* If the pattern can match an empty string, an empty string may be returned.
* @param dest A mutable UText in which the matching text is placed.
* If NULL, a new UText will be created (which may not be mutable).
* @param flag Must be RegexMatcher::MATCHER_DEST_IS_UTEXT; used to
* disambiguate method signature.
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed.
* @return A string containing the matched input text. If a pre-allocated UText
* was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
*/
virtual UText *group(UText *dest, MatcherDestIsUTextFlag flag, UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
@ -888,6 +864,31 @@ public:
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @stable ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns a shallow clone of the entire live input string with the UText current native index
* set to the beginning of the requested group.
* Note that copying the entire input string may cause significant performance and memory issues.
* @param dest The UText into which the input should be copied, or NULL to create a new UText
* @param group_len A reference to receive the length of the desired capture group
* @param status A reference to a UErrorCode to receive any errors.
* Possible errors are U_REGEX_INVALID_STATE if no match
* has been attempted or the last match failed and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
* @return dest if non-NULL, a shallow copy of the input text otherwise
*
* @draft ICU 4.6
*/
virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
/**
* Returns a string containing the text captured by the given group
* during the previous match operation. Group(0) is the entire match.
@ -906,23 +907,20 @@ public:
virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
* @stable ICU 2.4
*/
virtual int32_t groupCount() const;
/**
* Returns the index in the input string of the start of the text matched
* during the previous match operation.
* @param status a reference to a UErrorCode to receive any errors.
* @return The position in the input string of the start of the last match.
* @return The (native) position in the input string of the start of the last match.
* @stable ICU 2.4
*/
virtual int32_t start(UErrorCode &status) const;
/**
* @draft ICU 4.6
*/
virtual int64_t start64(UErrorCode &status) const;
/**
* Returns the index in the input string of the start of the text matched by the
@ -934,11 +932,16 @@ public:
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed, and
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
* @return the start position of substring matched by the specified group.
* @return the (native) start position of substring matched by the specified group.
* @stable ICU 2.4
*/
virtual int32_t start(int32_t group, UErrorCode &status) const;
/**
* @draft ICU 4.6
*/
virtual int64_t start64(int32_t group, UErrorCode &status) const;
/**
* Returns the index in the input string of the first character following the
@ -947,10 +950,18 @@ public:
* errors are U_REGEX_INVALID_STATE if no match has been
* attempted or the last match failed.
* @return the index of the last character matched, plus one.
* The index value returned is a native index, corresponding to
* code units for the underlying encoding type, for example,
* a byte index for UTF8.
* @stable ICU 2.4
*/
virtual int32_t end(UErrorCode &status) const;
/**
* @draft ICU 4.6
*/
virtual int64_t end64(UErrorCode &status) const;
/**
* Returns the index in the input string of the character following the
@ -963,10 +974,18 @@ public:
* @return the index of the first character following the text
* captured by the specifed group during the previous match operation.
* Return -1 if the capture group exists in the pattern but was not part of the match.
* The index value returned is a native index, corresponding to
* code units for the underlying encoding type, for example,
* a byte index for UTF8.
* @stable ICU 2.4
*/
virtual int32_t end(int32_t group, UErrorCode &status) const;
/**
* @draft ICU 4.6
*/
virtual int64_t end64(int32_t group, UErrorCode &status) const;
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
@ -983,7 +1002,7 @@ public:
* Resets this matcher, and set the current input position.
* The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at
* the specified position in the input string.
* the specified (native) position in the input string.
* <p>
* The matcher's region is reset to its default, which is the entire
* input string.
@ -994,7 +1013,7 @@ public:
* @return this RegexMatcher.
* @stable ICU 2.8
*/
virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
/**
@ -1028,7 +1047,7 @@ public:
* until after regexp operations on it are done.
* @return this RegexMatcher.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual RegexMatcher &reset(UText *input);
@ -1064,7 +1083,7 @@ public:
* a UnicodeString.
* @return the input text
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual UText *inputText() const;
@ -1075,9 +1094,9 @@ public:
* @param dest The UText into which the input should be copied, or NULL to create a new UText
* @return dest if non-NULL, a shallow copy of the input text otherwise
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.6
*/
virtual UText *getInput(UText *dest) const;
virtual UText *getInput(UText *dest, UErrorCode &status) const;
/** Sets the limits of this matcher's region.
@ -1093,35 +1112,55 @@ public:
* The function will fail if start is greater than limit, or if either index
* is less than zero or greater than the length of the string being matched.
*
* @param start The index to begin searches at.
* @param start The (native) index to begin searches at.
* @param limit The index to end searches at (exclusive).
* @param status A reference to a UErrorCode to receive any errors.
* @stable ICU 4.0
*/
virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
/**
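* Identical to region(start, limit, status) but also allows a start position without
* resetting the region state.
* @param regionStart The (native) index at which the region begins.
* @param regionLimit The (native) index at which the region ends (exclusive).
* @param startIndex The (native) index within the region bounds at which to begin searches,
* or -1 to behave exactly like region(start, limit, status).
* @param status A reference to a UErrorCode to receive any errors.
* If startIndex is not within the specified region bounds,
* U_INDEX_OUTOFBOUNDS_ERROR is returned.
* @draft ICU 4.6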
*/
virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
/**
* Reports the start index of this matcher's region. The searches this matcher
* conducts are limited to finding matches within regionStart (inclusive) and
* regionEnd (exclusive).
*
* @return The starting index of this matcher's region.
* @return The starting (native) index of this matcher's region.
* @stable ICU 4.0
*/
virtual int32_t regionStart() const;
/**
* @draft ICU 4.6
*/
virtual int64_t regionStart64() const;
/**
* Reports the end (limit) index (exclusive) of this matcher's region. The searches
* this matcher conducts are limited to finding matches within regionStart
* (inclusive) and regionEnd (exclusive).
*
* @return The ending point of this matcher's region.
* @return The ending point (native) of this matcher's region.
* @stable ICU 4.0
*/
virtual int32_t regionEnd() const;
/**
* @draft ICU 4.6
*/
virtual int64_t regionEnd64() const;
/**
* Queries the transparency of region bounds for this matcher.
* See useTransparentBounds for a description of transparent and opaque bounds.
@ -1249,7 +1288,7 @@ public:
* @return a string containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
@ -1299,7 +1338,7 @@ public:
* @return a string containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
@ -1360,7 +1399,7 @@ public:
*
* @return this RegexMatcher
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual RegexMatcher &appendReplacement(UText *dest,
UText *replacement, UErrorCode &status);
@ -1389,9 +1428,9 @@ public:
* Must not be NULL.
* @return the destination string.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.6
*/
virtual UText *appendTail(UText *dest);
virtual UText *appendTail(UText *dest, UErrorCode &status);
/**
@ -1444,7 +1483,7 @@ public:
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
virtual int32_t split(UText *input,
UText *dest[],
@ -1581,6 +1620,15 @@ public:
UErrorCode &status);
/**
 * Inline version of ReportFindProgress(), used to eliminate a function call where a
 * check for the callback suffices.
 *
 * Evaluates to true only when a find progress callback is installed
 * (fFindProgressCallbackFn is non-NULL) and ReportFindProgress(pos, status)
 * returns FALSE. The callback pointer is tested first, so ReportFindProgress()
 * is not called at all when no callback is set.
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand when combined with !, || or other operators at the use site.
 *
 * @draft ICU 4.6
 */
#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
    ((fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE))
/**
* setTrace Debug function, enable/disable tracing of the matching engine.

View file

@ -158,9 +158,9 @@ uregex_open( const UChar *pattern,
* information is not wanted, pass NULL for this parameter.
* @param status Receives error detected by this function.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL URegularExpression * U_EXPORT2
U_DRAFT URegularExpression * U_EXPORT2
uregex_openUText(UText *pattern,
uint32_t flags,
UParseError *pe,
@ -280,9 +280,9 @@ uregex_pattern(const URegularExpression *regexp,
* @return the pattern text. The storage for the text is owned by the regular expression
* object, and must not be altered or deleted.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL UText * U_EXPORT2
U_DRAFT UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp,
UErrorCode *status);
@ -341,9 +341,9 @@ uregex_setText(URegularExpression *regexp,
* @param text The subject text string.
* @param status Receives errors detected by this function.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL void U_EXPORT2
U_DRAFT void U_EXPORT2
uregex_setUText(URegularExpression *regexp,
UText *text,
UErrorCode *status);
@ -388,9 +388,9 @@ uregex_getText(URegularExpression *regexp,
* @return The subject text currently associated with this regular expression.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL UText * U_EXPORT2
U_DRAFT UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp,
UText *dest,
UErrorCode *status);
@ -409,7 +409,7 @@ uregex_getUText(URegularExpression *regexp,
* Matcher.matches() in Java
*
* @param regexp The compiled regular expression.
* @param startIndex The input string index at which to begin matching, or -1
* @param startIndex The input string (native) index at which to begin matching, or -1
* to match the input Region.
* @param status Receives errors detected by this function.
* @return TRUE if there is a match
@ -420,6 +420,15 @@ uregex_matches(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status);
/**
* 64bit version of uregex_matches.
* @draft ICU 4.6
*/
U_DRAFT UBool U_EXPORT2
uregex_matches64(URegularExpression *regexp,
int64_t startIndex,
UErrorCode *status);
/**
* Attempts to match the input string, starting from the specified index, against the pattern.
* The match may be of any length, and is not required to extend to the end
@ -437,7 +446,7 @@ uregex_matches(URegularExpression *regexp,
* and <code>uregex_group()</code> functions.</p>
*
* @param regexp The compiled regular expression.
* @param startIndex The input string index at which to begin matching, or
* @param startIndex The input string (native) index at which to begin matching, or
* -1 to match the Input Region
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match.
@ -448,6 +457,15 @@ uregex_lookingAt(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status);
/**
* 64bit version of uregex_lookingAt.
* @draft ICU 4.6
*/
U_DRAFT UBool U_EXPORT2
uregex_lookingAt64(URegularExpression *regexp,
int64_t startIndex,
UErrorCode *status);
/**
* Find the first matching substring of the input string that matches the pattern.
* If startIndex is >= zero the search for a match begins at the specified index,
@ -461,7 +479,7 @@ uregex_lookingAt(URegularExpression *regexp,
* <code>uregex_group()</code> will provide more information regarding the match.
*
* @param regexp The compiled regular expression.
* @param startIndex The position in the input string to begin the search, or
* @param startIndex The position (native) in the input string to begin the search, or
* -1 to search within the Input Region.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
@ -472,6 +490,15 @@ uregex_find(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status);
/**
* 64bit version of uregex_find.
* @draft ICU 4.6
*/
U_DRAFT UBool U_EXPORT2
uregex_find64(URegularExpression *regexp,
int64_t startIndex,
UErrorCode *status);
/**
* Find the next pattern match in the input string. Begin searching
* the input at the location following the end of the previous match,
@ -523,6 +550,37 @@ uregex_group(URegularExpression *regexp,
int32_t destCapacity,
UErrorCode *status);
/** Returns a shallow immutable clone of the entire input string. The returned UText current native index
* is set to the beginning of the requested capture group. The capture group length is also
* returned via groupLength.
* Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses.
*
* @param regexp The compiled regular expression.
* @param groupNum The capture group to extract. Group 0 is the complete
* match. The value of this parameter must be
* less than or equal to the number of capture groups in
* the pattern.
* @param dest A mutable UText in which to store the current input.
* If NULL, a new UText will be created as an immutable shallow clone
* of the entire input string.
* @param groupLength Out parameter; receives the length of the requested capture group.
* @param status A reference to a UErrorCode to receive any errors.
* @return The subject text currently associated with this regular expression.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @draft ICU 4.6
*/
U_DRAFT UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp,
int32_t groupNum,
UText *dest,
int64_t *groupLength,
UErrorCode *status);
/** Extract the string for the specified matching expression or subexpression.
* Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses.
@ -541,12 +599,11 @@ uregex_group(URegularExpression *regexp,
* @internal ICU 4.4 technology preview
*/
U_INTERNAL UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp,
uregex_groupUTextDeep(URegularExpression *regexp,
int32_t groupNum,
UText *dest,
UErrorCode *status);
/**
* Returns the index in the input string of the start of the text matched by the
* specified capture group during the previous match operation. Return -1 if
@ -557,7 +614,7 @@ uregex_groupUText(URegularExpression *regexp,
* @param regexp The compiled regular expression.
* @param groupNum The capture group number
* @param status A reference to a UErrorCode to receive any errors.
* @return the starting position in the input of the text matched
* @return the starting (native) position in the input of the text matched
* by the specified group.
* @stable ICU 3.0
*/
@ -566,6 +623,15 @@ uregex_start(URegularExpression *regexp,
int32_t groupNum,
UErrorCode *status);
/**
* 64bit version of uregex_start.
* @draft ICU 4.6
*/
U_DRAFT int64_t U_EXPORT2
uregex_start64(URegularExpression *regexp,
int32_t groupNum,
UErrorCode *status);
/**
* Returns the index in the input string of the position following the end
* of the text matched by the specified capture group.
@ -576,7 +642,7 @@ uregex_start(URegularExpression *regexp,
* @param regexp The compiled regular expression.
* @param groupNum The capture group number
* @param status A reference to a UErrorCode to receive any errors.
* @return the index of the position following the last matched character.
* @return the (native) index of the position following the last matched character.
* @stable ICU 3.0
*/
U_STABLE int32_t U_EXPORT2
@ -584,6 +650,15 @@ uregex_end(URegularExpression *regexp,
int32_t groupNum,
UErrorCode *status);
/**
* 64bit version of uregex_end.
* @draft ICU 4.6
*/
U_DRAFT int64_t U_EXPORT2
uregex_end64(URegularExpression *regexp,
int32_t groupNum,
UErrorCode *status);
/**
* Reset any saved state from the previous match. Has the effect of
* causing uregex_findNext to begin at the specified index, and causing
@ -592,7 +667,7 @@ uregex_end(URegularExpression *regexp,
* match region that may have been set.
*
* @param regexp The compiled regular expression.
* @param index The position in the text at which a
* @param index The position (native) in the text at which a
* uregex_findNext() should begin searching.
* @param status A reference to a UErrorCode to receive any errors.
* @stable ICU 3.0
@ -602,7 +677,15 @@ uregex_reset(URegularExpression *regexp,
int32_t index,
UErrorCode *status);
/**
* 64bit version of uregex_reset.
* @draft ICU 4.6
*/
U_DRAFT void U_EXPORT2
uregex_reset64(URegularExpression *regexp,
int64_t index,
UErrorCode *status);
/** Sets the limits of the matching region for this URegularExpression.
* The region is the part of the input string that will be considered when matching.
* Invoking this method resets any saved state from the previous match,
@ -617,8 +700,8 @@ uregex_reset(URegularExpression *regexp,
* is less than zero or greater than the length of the string being matched.
*
* @param regexp The compiled regular expression.
* @param regionStart The index to begin searches at.
* @param regionLimit The index to end searches at (exclusive).
* @param regionStart The (native) index to begin searches at.
* @param regionLimit The (native) index to end searches at (exclusive).
* @param status A pointer to a UErrorCode to receive any errors.
* @stable ICU 4.0
*/
@ -628,20 +711,48 @@ uregex_setRegion(URegularExpression *regexp,
int32_t regionLimit,
UErrorCode *status);
/**
* 64bit version of uregex_setRegion.
* @draft ICU 4.6
*/
U_DRAFT void U_EXPORT2
uregex_setRegion64(URegularExpression *regexp,
int64_t regionStart,
int64_t regionLimit,
UErrorCode *status);
/**
* Variation on uregex_setRegion that sets the region and the start index together,
* without resetting the position for subsequent matches.
* @draft ICU 4.6
*/
U_DRAFT void U_EXPORT2
uregex_setRegionAndStart(URegularExpression *regexp,
int64_t regionStart,
int64_t regionLimit,
int64_t startIndex,
UErrorCode *status);
/**
* Reports the start index of the matching region. Any matches found are limited to
* the region bounded by regionStart (inclusive) and regionEnd (exclusive).
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The starting index of this matcher's region.
* @return The starting (native) index of this matcher's region.
* @stable ICU 4.0
*/
U_STABLE int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp,
UErrorCode *status);
/**
* 64bit version of uregex_regionStart.
* @draft ICU 4.6
*/
U_DRAFT int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression *regexp,
UErrorCode *status);
/**
* Reports the end index (exclusive) of the matching region for this URegularExpression.
@ -650,13 +761,21 @@ uregex_regionStart(const URegularExpression *regexp,
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The ending point of this matcher's region.
* @return The ending point (native) of this matcher's region.
* @stable ICU 4.0
*/
U_STABLE int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp,
UErrorCode *status);
/**
* 64bit version of uregex_regionEnd.
* @draft ICU 4.6
*/
U_DRAFT int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression *regexp,
UErrorCode *status);
/**
* Queries the transparency of region bounds for this URegularExpression.
* See useTransparentBounds for a description of transparent and opaque bounds.
@ -813,9 +932,9 @@ uregex_replaceAll(URegularExpression *regexp,
* @return A UText containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL UText * U_EXPORT2
U_DRAFT UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression *regexp,
UText *replacement,
UText *dest,
@ -872,9 +991,9 @@ uregex_replaceFirst(URegularExpression *regexp,
* @return A UText containing the results of the find and replace.
* If a pre-allocated UText was provided, it will always be used and returned.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL UText * U_EXPORT2
U_DRAFT UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression *regexp,
UText *replacement,
UText *dest,
@ -956,9 +1075,9 @@ uregex_appendReplacement(URegularExpression *regexp,
* @param dest A mutable UText that will receive the result. Must not be NULL.
* @param status A reference to a UErrorCode to receive any errors.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL void U_EXPORT2
U_DRAFT void U_EXPORT2
uregex_appendReplacementUText(URegularExpression *regexp,
UText *replacementText,
UText *dest,
@ -1009,11 +1128,12 @@ uregex_appendTail(URegularExpression *regexp,
* @param dest A mutable UText that will receive the result. Must not be NULL.
* @return The destination UText.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.6
*/
U_INTERNAL UText * U_EXPORT2
U_DRAFT UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp,
UText *dest);
UText *dest,
UErrorCode *status);
@ -1105,9 +1225,9 @@ uregex_split( URegularExpression *regexp,
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
*
* @internal ICU 4.4 technology preview
* @draft ICU 4.4
*/
U_INTERNAL int32_t U_EXPORT2
U_DRAFT int32_t U_EXPORT2
uregex_splitUText(URegularExpression *regexp,
UText *destFields[],
int32_t destFieldsCapacity,
@ -1257,10 +1377,21 @@ uregex_getMatchCallback(const URegularExpression *regexp,
/**
* Function pointer for a regular expression find/findNext callback function.
* When set, a callback function will be called during a find operation after each
* attempt at a match. If the call back function returns FALSE, the find
* operation will be terminated early.
* Function pointer for a regular expression find callback function.
*
* When set, a callback function will be called during a find operation
* and for operations that depend on find, such as findNext, split and some replace
* operations like replaceFirst.
* The callback will usually be called after each attempt at a match, but this is not a
* guarantee that the callback will be invoked at each character. For finds where the
* match engine is invoked at each character, this may be close to true, but less likely
* for more optimized loops where the pattern is known to only start, and the match
* engine invoked, at certain characters.
* When invoked, this callback will specify the index at which a match operation is about
* to be attempted, giving the application the opportunity to terminate a long-running
* find operation.
*
* If the callback function returns FALSE, the find operation will be terminated early.
*
* Note: the callback function must not call other functions on this
* URegularExpression
@ -1282,9 +1413,7 @@ typedef UBool U_CALLCONV URegexFindProgressCallback (
U_CDECL_END
/**
* During find operations, this callback will be invoked after each return from a
* match attempt, specifying the next index at which a match operation is about to be attempted,
* giving the application the opportunity to terminate a long-running find operation.
* Set the find progress callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback A pointer to the user-supplied callback function.
@ -1302,7 +1431,7 @@ uregex_setFindProgressCallback(URegularExpression *regexp,
/**
* Get the callback function for this URegularExpression.
* Get the find progress callback function for this URegularExpression.
*
* @param regexp The compiled regular expression.
* @param callback Out parameter, receives a pointer to the user-supplied

View file

@ -341,8 +341,7 @@ U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
(void)status;
return regexp->fPat->patternText();
return regexp->fPat->patternText(*status);
}
@ -479,7 +478,7 @@ uregex_getUText(URegularExpression *regexp2,
if (validateRE(regexp, status, FALSE) == FALSE) {
return dest;
}
return regexp->fMatcher->getInput(dest);
return regexp->fMatcher->getInput(dest, *status);
}
@ -490,8 +489,15 @@ uregex_getUText(URegularExpression *regexp2,
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression *regexp2,
int32_t startIndex,
UErrorCode *status) {
int32_t startIndex,
UErrorCode *status) {
return uregex_matches64( regexp2, (int64_t)startIndex, status);
}
U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression *regexp2,
int64_t startIndex,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
@ -506,7 +512,6 @@ uregex_matches(URegularExpression *regexp2,
}
//------------------------------------------------------------------------------
//
// uregex_lookingAt
@ -516,6 +521,13 @@ U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression *regexp2,
int32_t startIndex,
UErrorCode *status) {
return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
}
U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression *regexp2,
int64_t startIndex,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
@ -540,6 +552,13 @@ U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression *regexp2,
int32_t startIndex,
UErrorCode *status) {
return uregex_find64( regexp2, (int64_t)startIndex, status);
}
U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression *regexp2,
int64_t startIndex,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
@ -554,6 +573,7 @@ uregex_find(URegularExpression *regexp2,
return result;
}
//------------------------------------------------------------------------------
//
// uregex_findNext
@ -609,7 +629,7 @@ uregex_group(URegularExpression *regexp2,
if (destCapacity == 0 || regexp->fText != NULL) {
// If preflighting or if we already have the text as UChars,
// this is a little cheaper than going through uregex_groupUText()
// this is a little cheaper than going through uregex_groupUTextDeep()
//
// Pick up the range of characters from the matcher
@ -642,7 +662,7 @@ uregex_group(URegularExpression *regexp2,
}
return fullLength;
} else {
UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status);
UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
utext_close(groupText);
return result;
@ -657,6 +677,26 @@ uregex_group(URegularExpression *regexp2,
//------------------------------------------------------------------------------
// C wrapper around RegexMatcher::group(groupNum, dest, group_len, status).
//
//   regexp2      The regular expression handle; cast to RegularExpression.
//   groupNum     Capture group number, forwarded to the matcher unchanged.
//   dest         Optional pre-allocated UText, forwarded to the matcher. On a
//                validation failure it is returned untouched when non-NULL.
//   groupLength  Receives the group length reported by the matcher. May be
//                NULL, in which case the length is discarded. It is zeroed
//                up front so the caller never reads an indeterminate value
//                after a failure.
//   status       Error code, checked by validateRE() and passed to the matcher.
//
// Returns the matcher's result, or, when validateRE() fails, dest if one was
// supplied and otherwise a newly opened empty UText.
U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp2,
                  int32_t             groupNum,
                  UText              *dest,
                  int64_t            *groupLength,
                  UErrorCode         *status) {
    // Route the length through a local when the caller passed NULL, so the
    // reference handed to the matcher is always bound to valid storage.
    int64_t  discardedLength = 0;
    int64_t &lengthOut = (groupLength != NULL) ? *groupLength : discardedLength;
    lengthOut = 0;

    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        // The empty UText is opened with its own status so that the error
        // already recorded in *status is not overwritten.
        UErrorCode emptyTextStatus = U_ZERO_ERROR;
        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    }

    return regexp->fMatcher->group(groupNum, dest, lengthOut, *status);
}
//------------------------------------------------------------------------------
//
// uregex_groupUTextDeep
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2
uregex_groupUTextDeep(URegularExpression *regexp2,
int32_t groupNum,
UText *dest,
UErrorCode *status) {
@ -693,7 +733,6 @@ uregex_groupUText(URegularExpression *regexp2,
}
}
//------------------------------------------------------------------------------
//
// uregex_start
@ -703,6 +742,13 @@ U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
return (int32_t)uregex_start64( regexp2, groupNum, status);
}
U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return 0;
@ -711,7 +757,6 @@ uregex_start(URegularExpression *regexp2,
return result;
}
//------------------------------------------------------------------------------
//
// uregex_end
@ -721,6 +766,13 @@ U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
return (int32_t)uregex_end64( regexp2, groupNum, status);
}
U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return 0;
@ -738,6 +790,13 @@ U_CAPI void U_EXPORT2
uregex_reset(URegularExpression *regexp2,
int32_t index,
UErrorCode *status) {
uregex_reset64( regexp2, (int64_t)index, status);
}
U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression *regexp2,
int64_t index,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return;
@ -756,6 +815,14 @@ uregex_setRegion(URegularExpression *regexp2,
int32_t regionStart,
int32_t regionLimit,
UErrorCode *status) {
uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
}
U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression *regexp2,
int64_t regionStart,
int64_t regionLimit,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return;
@ -764,6 +831,24 @@ uregex_setRegion(URegularExpression *regexp2,
}
//------------------------------------------------------------------------------
//
// uregex_setRegionAndStart
//
//------------------------------------------------------------------------------
U_DRAFT void U_EXPORT2
uregex_setRegionAndStart(URegularExpression *regexp2,
int64_t regionStart,
int64_t regionLimit,
int64_t startIndex,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return;
}
regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
}
//------------------------------------------------------------------------------
//
// uregex_regionStart
@ -772,6 +857,12 @@ uregex_setRegion(URegularExpression *regexp2,
U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp2,
UErrorCode *status) {
return (int32_t)uregex_regionStart64(regexp2, status);
}
U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return 0;
@ -788,6 +879,12 @@ uregex_regionStart(const URegularExpression *regexp2,
U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp2,
UErrorCode *status) {
return (int32_t)uregex_regionEnd64(regexp2, status);
}
U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, status) == FALSE) {
return 0;
@ -1602,9 +1699,10 @@ uregex_appendTail(URegularExpression *regexp2,
//
U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp2,
UText *dest) {
UText *dest,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
return regexp->fMatcher->appendTail(dest);
return regexp->fMatcher->appendTail(dest, *status);
}

View file

@ -1731,21 +1731,47 @@ static void TestUTextAPI(void) {
/* Capture Group 0, the full match. Should succeed. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 0, NULL, &status);
actual = uregex_groupUTextDeep(re, 0, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_UTEXT(str_abcinteriordef, actual);
utext_close(actual);
/* Capture Group 0 with shallow clone API. Should succeed. */
status = U_ZERO_ERROR;
{
    /* All locals are declared ahead of the first statement so the block stays valid C89. */
    int64_t    group_len = 0;   /* zeroed so a failed call cannot leave a garbage size for malloc */
    int32_t    len16;
    UErrorCode shallowStatus = U_ZERO_ERROR;
    int64_t    nativeIndex;
    UChar     *groupChars;
    UText      groupText = UTEXT_INITIALIZER;

    actual = uregex_groupUText(re, 0, NULL, &group_len, &status);
    TEST_ASSERT_SUCCESS(status);

    /* The returned UText is positioned at the start of the group; remember that index. */
    nativeIndex = utext_getNativeIndex(actual);
    /* Following returns U_INDEX_OUTOFBOUNDS_ERROR... looks like a bug in ucstrFuncs UTextFuncs [utext.cpp] */
    /* len16 = utext_extract(actual, nativeIndex, nativeIndex + group_len, NULL, 0, &shallowStatus); */
    len16 = (int32_t)group_len;

    /* Copy the group out of the shallow clone and compare it through a UText over the copy. */
    groupChars = (UChar *)malloc(sizeof(UChar)*(len16+1));
    TEST_ASSERT(groupChars != NULL);
    if (groupChars != NULL) {
        utext_extract(actual, nativeIndex, nativeIndex + group_len, groupChars, len16+1, &shallowStatus);
        utext_openUChars(&groupText, groupChars, len16, &shallowStatus);
        TEST_ASSERT_UTEXT(str_abcinteriordef, &groupText);
        /* Close the UText before releasing the buffer it reads from. */
        utext_close(&groupText);
        free(groupChars);
    }
}
utext_close(actual);
/* Capture group #1. Should succeed. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 1, NULL, &status);
actual = uregex_groupUTextDeep(re, 1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_UTEXT(str_interior, actual);
utext_close(actual);
/* Capture group out of range. Error. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 2, NULL, &status);
actual = uregex_groupUTextDeep(re, 2, NULL, &status);
TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
TEST_ASSERT(utext_nativeLength(actual) == 0);
utext_close(actual);

View file

@ -28,7 +28,6 @@
#define SUPPORT_MUTATING_INPUT_STRING 0
//---------------------------------------------------------------------------
//
// Test class boilerplate
@ -1878,14 +1877,19 @@ void RegexTest::API_Match_UTF8() {
utext_openUnicodeString(&destText, &dest, &status);
UText *result;
//const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
// Test shallow-clone API
int64_t group_len;
result = matcher->group((UText *)NULL, group_len, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
utext_close(result);
result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
result = matcher->group(0, &destText, group_len, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
// destText is now immutable, reopen it
utext_close(&destText);
utext_openUnicodeString(&destText, &dest, &status);
result = matcher->group(0, NULL, status);
REGEX_CHECK_STATUS;
@ -2066,14 +2070,14 @@ void RegexTest::API_Match_UTF8() {
unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
m.reset(&s);
for (i=0; ; i+=2) {
for (i=0; ; i+=4) {
if (m.find() == FALSE) {
break;
}
REGEX_ASSERT(m.start(status) == i);
REGEX_ASSERT(m.end(status) == i);
}
REGEX_ASSERT(i==10);
REGEX_ASSERT(i==20);
utext_close(&s);
}
@ -2577,7 +2581,7 @@ const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
m.appendTail(&resultText);
m.appendTail(&resultText, status);
const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
@ -2900,14 +2904,14 @@ void RegexTest::API_Pattern_UTF8() {
//
pat1 = new RegexPattern();
REGEX_ASSERT(pat1->pattern() == "");
REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText());
REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
delete pat1;
regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);
pat1 = RegexPattern::compile(&re1, pe, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText());
REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
delete pat1;
utext_close(&re1);
@ -3090,6 +3094,31 @@ static void set(UVector &vec, int32_t val, UnicodeString index) {
vec.setElementAt(val, idx);
}
// Store val at position idx of vec, first padding the vector with -1
// entries until it is long enough to hold that position.
//
// The padding loop stops as soon as an append reports failure; without
// that check a failed addElement() would leave the size unchanged and
// the loop would never terminate. Nothing is stored in that case.
static void setInt(UVector &vec, int32_t val, int32_t idx) {
    UErrorCode status = U_ZERO_ERROR;
    while (U_SUCCESS(status) && vec.size() <= idx) {
        vec.addElement(-1, status);
    }
    if (U_SUCCESS(status)) {
        vec.setElementAt(val, idx);
    }
}
// Translate a UTF-16 code unit offset into the corresponding native index of utext.
//
// The text is rewound to native index 0 and then read one code point at a
// time, adding U16_LENGTH() of each code point to a running count until the
// count reaches unistrOffset. The native index at which the walk stopped is
// written to nativeIndex in every case.
//
// Returns TRUE when the requested offset was reached, FALSE when the text
// ran out (U_SENTINEL) first.
static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
{
    UTEXT_SETNATIVEINDEX(utext, 0);

    UBool reachedOffset = TRUE;
    for (int32_t unitsSeen = 0; unitsSeen < unistrOffset; ) {
        UChar32 cp = UTEXT_NEXT32(utext);
        if (cp == U_SENTINEL) {
            // End of text before the requested offset.
            reachedOffset = FALSE;
            break;
        }
        unitsSeen += U16_LENGTH(cp);
    }

    nativeIndex = UTEXT_GETNATIVEINDEX(utext);
    return reachedOffset;
}
void RegexTest::regex_find(const UnicodeString &pattern,
const UnicodeString &flags,
const UnicodeString &inputString,
@ -3112,6 +3141,8 @@ void RegexTest::regex_find(const UnicodeString &pattern,
RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
UVector groupStarts(status);
UVector groupEnds(status);
UVector groupStartsUTF8(status);
UVector groupEndsUTF8(status);
UBool isMatch = FALSE, isUTF8Match = FALSE;
UBool failed = FALSE;
int32_t numFinds;
@ -3120,6 +3151,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
UBool useLookingAtFunc = FALSE;
int32_t regionStart = -1;
int32_t regionEnd = -1;
int32_t regionStartUTF8 = -1;
int32_t regionEndUTF8 = -1;
//
// Compile the caller's pattern
@ -3278,7 +3312,6 @@ void RegexTest::regex_find(const UnicodeString &pattern,
goto cleanupAndReturn;
}
//
// Configure the matcher according to the flags specified with this test.
//
@ -3307,11 +3340,47 @@ void RegexTest::regex_find(const UnicodeString &pattern,
}
}
//
// Generate native indices for UTF8 versions of region and capture group info
//
// The expected values supplied to this test are UTF16 indexes. UTF8Matcher is
// checked against native indexes instead, so each expected value is translated
// with utextOffsetToNative() into the parallel *UTF8 variables/vectors.
// Nothing is done when there is no UTF8 matcher.
//
if (UTF8Matcher != NULL) {
// Region bounds: convert only when set (>= 0). The conversion result is
// deliberately discarded here (cast to void); on failure the *UTF8 value is
// whatever utextOffsetToNative left in it.
if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
// Fill out the native index UVector info.
// Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
for (i=0; i<groupStarts.size(); i++) {
int32_t start = groupStarts.elementAti(i);
// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
if (start >= 0) {
int32_t startUTF8;
// Unlike the region bounds, a failed conversion of a group position is
// reported and aborts the test.
if (!utextOffsetToNative(&inputText, start, startUTF8)) {
errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
// Store at the same group index i so both vectors stay parallel.
setInt(groupStartsUTF8, startUTF8, i);
}
int32_t end = groupEnds.elementAti(i);
// -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
if (end >= 0) {
int32_t endUTF8;
if (!utextOffsetToNative(&inputText, end, endUTF8)) {
errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
setInt(groupEndsUTF8, endUTF8, i);
}
}
}
// Apply the test's region to both matchers, but only when one was specified
// (regionStart is negative otherwise). Each region() call is followed by a
// status check tagged with the test line number.
if (regionStart>=0) {
matcher->region(regionStart, regionEnd, status);
REGEX_CHECK_STATUS_L(line);
if (UTF8Matcher != NULL) {
// NOTE(review): two consecutive region() calls on UTF8Matcher. This text looks
// like a diff whose +/- markers were stripped, so the first call (UTF16 bounds)
// is presumably the removed line and the second (native-index bounds) its
// replacement - confirm against the real file.
UTF8Matcher->region(regionStart, regionEnd, status);
UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
REGEX_CHECK_STATUS_L(line);
}
}
@ -3388,28 +3457,30 @@ void RegexTest::regex_find(const UnicodeString &pattern,
REGEX_CHECK_STATUS_L(line);
// Verify the start and end position of every capture group, 0..groupCount(),
// for the primary matcher and, when present, for UTF8Matcher.
// A group with no entry in the expected-position vector is expected to report -1.
// The primary matcher is compared with the UTF16 expectations, UTF8Matcher with
// the native-index (*UTF8) expectations.
//
// NOTE(review): this span looks like a diff whose +/- markers were stripped.
// Wherever two near-identical lines follow each other (one using expectedStart /
// expectedEnd, the next using expectedStartUTF8 / expectedEndUTF8), the first is
// presumably the removed line and the second its replacement - confirm against
// the real file.
for (i=0; i<=matcher->groupCount(); i++) {
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
// A wrong start position is treated as fatal for this test case.
if (matcher->start(i, status) != expectedStart) {
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
line, i, expectedStart, matcher->start(i, status));
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) {
} else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
line, i, expectedStart, UTF8Matcher->start(i, status));
line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
// A wrong end position is recorded as a failure, but the loop continues.
if (matcher->end(i, status) != expectedEnd) {
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
line, i, expectedEnd, matcher->end(i, status));
failed = TRUE;
// Error on end position; keep going; real error is probably yet to come as group
// end positions work from end of the input data towards the front.
} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) {
} else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
line, i, expectedEnd, UTF8Matcher->end(i, status));
line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
failed = TRUE;
// Error on end position; keep going; real error is probably yet to come as group
// end positions work from end of the input data towards the front.
@ -4757,21 +4828,21 @@ void RegexTest::PreAllocatedUTextCAPI () {
/*
 * Each case below resets status, extracts one capture group into the
 * caller-supplied bufferText, and asserts that the returned UText is that
 * same bufferText.
 *
 * NOTE(review): every case contains two consecutive assignments to `actual`
 * (uregex_groupUText, then uregex_groupUTextDeep). This text looks like a diff
 * whose +/- markers were stripped, so the first is presumably the removed line
 * and the second its replacement - confirm against the real file.
 */
/* Capture Group 0, the full match. Should succeed. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 0, &bufferText, &status);
actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(actual == &bufferText);
REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
/* Capture group #1. Should succeed. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 1, &bufferText, &status);
actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(actual == &bufferText);
REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
/* Capture group out of range. Error. */
/* The call must report U_INDEX_OUTOFBOUNDS_ERROR yet still hand back the caller's bufferText. */
status = U_ZERO_ERROR;
actual = uregex_groupUText(re, 2, &bufferText, &status);
actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
REGEX_ASSERT(actual == &bufferText);