diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index ce7d284618d..661334245b2 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2014 International Business Machines Corporation * +* Copyright (C) 2002-2015 International Business Machines Corporation * * and others. All rights reserved. * ************************************************************************** */ @@ -1175,97 +1175,32 @@ UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { UnicodeString result; - if (U_FAILURE(status)) { + int64_t groupStart = start64(groupNum, status); + int64_t groupEnd = end64(groupNum, status); + if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { return result; } - UText resultText = UTEXT_INITIALIZER; - utext_openUnicodeString(&resultText, &result, &status); - group(groupNum, &resultText, status); - utext_close(&resultText); + + // Get the group length using a utext_extract preflight. + // UText is actually pretty efficient at this when underlying encoding is UTF-16. + int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + return result; + } + + status = U_ZERO_ERROR; + UChar *buf = result.getBuffer(length); + if (buf == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } else { + int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); + result.releaseBuffer(extractLength); + U_ASSERT(length == extractLength); + } return result; } -// Return deep (mutable) clone -// Technology Preview (as an API), but note that the UnicodeString API is implemented -// using this function. 
-UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { - if (U_FAILURE(status)) { - return dest; - } - - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - } else if (fMatch == FALSE) { - status = U_REGEX_INVALID_STATE; - } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { - status = U_INDEX_OUTOFBOUNDS_ERROR; - } - if (U_FAILURE(status)) { - return dest; - } - - int64_t s, e; - if (groupNum == 0) { - s = fMatchStart; - e = fMatchEnd; - } else { - int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); - U_ASSERT(groupOffset < fPattern->fFrameSize); - U_ASSERT(groupOffset >= 0); - s = fFrame->fExtra[groupOffset]; - e = fFrame->fExtra[groupOffset+1]; - } - - if (s < 0) { - // A capture group wasn't part of the match - if (dest) { - utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); - return dest; - } else { - return utext_openUChars(NULL, NULL, 0, &status); - } - } - U_ASSERT(s <= e); - - if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { - U_ASSERT(e <= fInputLength); - if (dest) { - utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); - } else { - UText groupText = UTEXT_INITIALIZER; - utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); - dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); - utext_close(&groupText); - } - } else { - int32_t len16; - if (UTEXT_USES_U16(fInputText)) { - len16 = (int32_t)(e-s); - } else { - UErrorCode lengthStatus = U_ZERO_ERROR; - len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); - } - UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); - if (groupChars == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return dest; - } - utext_extract(fInputText, s, e, groupChars, len16+1, &status); - - if (dest) { - utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); - } else { - UText groupText = UTEXT_INITIALIZER; - 
utext_openUChars(&groupText, groupChars, len16, &status); - dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); - utext_close(&groupText); - } - - uprv_free(groupChars); - } - return dest; -} //-------------------------------------------------------------------------------- // @@ -2001,6 +1936,67 @@ void RegexMatcher::setTrace(UBool state) { +/** + * UText, replace entire contents of the destination UText with a substring of the source UText. + * + * @param src The source UText + * @param dest The destination UText. Must be writable. + * May be NULL, in which case a new UText will be allocated. + * @param start Start index of source substring. + * @param limit Limit index of source substring. + * @param status An error code. + */ +static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { + if (U_FAILURE(*status)) { + return dest; + } + if (start == limit) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, status); + } + } + int32_t length = utext_extract(src, start, limit, NULL, 0, status); + if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { + return dest; + } + *status = U_ZERO_ERROR; + MaybeStackArray<UChar, 40> buffer; + if (length >= buffer.getCapacity()) { + UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. + if (newBuf == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + } + } + utext_extract(src, start, limit, buffer.getAlias(), length+1, status); + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); + return dest; + } + + // Caller did not provide a preexisting UText. + // Open a new one, and have it adopt the text buffer storage. 
+ if (U_FAILURE(*status)) { + return NULL; + } + int32_t ownedLength = 0; + UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); + if (ownedBuf == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + UText *result = utext_openUChars(NULL, ownedBuf, length, status); + if (U_FAILURE(*status)) { + uprv_free(ownedBuf); + return NULL; + } + result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); + return result; +} + + //--------------------------------------------------------------------- // // split @@ -2167,7 +2163,8 @@ int32_t RegexMatcher::split(UText *input, break; } i++; - dest[i] = group(groupNum, dest[i], status); + dest[i] = utext_extract_replace(fInputText, dest[i], + start64(groupNum, status), end64(groupNum, status), &status); } if (nextOutputStringStart == fActiveLimit) { diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 6f805f88e3a..ca5f1a9732b 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2002-2014, International Business Machines +* Copyright (C) 2002-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: regex.h @@ -896,24 +896,6 @@ public: */ virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; - /** - * Returns a string containing the text captured by the given group - * during the previous match operation. Group(0) is the entire match. - * - * @param groupNum the capture group number - * @param dest A mutable UText in which the matching text is placed. - * If NULL, a new UText will be created (which may not be mutable). - * @param status A reference to a UErrorCode to receive any errors. 
- * Possible errors are U_REGEX_INVALID_STATE if no match - * has been attempted or the last match failed. - * @return A string containing the matched input text. If a pre-allocated UText - * was provided, it will always be used and returned. - * - * @internal ICU 4.4 technology preview - */ - virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; - - /** * Returns the index in the input string of the start of the text matched * during the previous match operation. diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h index a85ba1295f0..cb7e08d82ad 100644 --- a/icu4c/source/i18n/unicode/uregex.h +++ b/icu4c/source/i18n/unicode/uregex.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2004-2013, International Business Machines +* Copyright (C) 2004-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uregex.h @@ -659,31 +659,6 @@ uregex_groupUText(URegularExpression *regexp, int64_t *groupLength, UErrorCode *status); -#ifndef U_HIDE_INTERNAL_API -/** Extract the string for the specified matching expression or subexpression. - * Group #0 is the complete string of matched text. - * Group #1 is the text matched by the first set of capturing parentheses. - * - * @param regexp The compiled regular expression. - * @param groupNum The capture group to extract. Group 0 is the complete - * match. The value of this parameter must be - * less than or equal to the number of capture groups in - * the pattern. - * @param dest Mutable UText to receive the matching string data. - * If NULL, a new UText will be created (which may not be mutable). - * @param status A reference to a UErrorCode to receive any errors. - * @return The matching string data. If a pre-allocated UText was provided, - * it will always be used and returned. 
- * - * @internal ICU 4.4 technology preview - */ -U_INTERNAL UText * U_EXPORT2 -uregex_groupUTextDeep(URegularExpression *regexp, - int32_t groupNum, - UText *dest, - UErrorCode *status); -#endif /* U_HIDE_INTERNAL_API */ - /** * Returns the index in the input string of the start of the text matched by the * specified capture group during the previous match operation. Return -1 if diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp index 01951234b9c..1f110f2c7b4 100644 --- a/icu4c/source/i18n/uregex.cpp +++ b/icu4c/source/i18n/uregex.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2004-2014, International Business Machines +* Copyright (C) 2004-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: uregex.cpp @@ -647,7 +647,7 @@ uregex_group(URegularExpression *regexp2, if (destCapacity == 0 || regexp->fText != NULL) { // If preflighting or if we already have the text as UChars, - // this is a little cheaper than going through uregex_groupUTextDeep() + // this is a little cheaper than extracting from the UText // // Pick up the range of characters from the matcher @@ -680,14 +680,18 @@ uregex_group(URegularExpression *regexp2, } return fullLength; } else { - int32_t result = 0; - UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); - if (U_SUCCESS(*status)) { - result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); + int64_t start = regexp->fMatcher->start64(groupNum, *status); + int64_t limit = regexp->fMatcher->end64(groupNum, *status); + if (U_FAILURE(*status)) { + return 0; } - utext_close(groupText); - return result; + // Note edge cases: + // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. + // Zero Length Match: start == end. 
+ int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); + return length; } + } @@ -711,49 +715,6 @@ uregex_groupUText(URegularExpression *regexp2, return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); } -//------------------------------------------------------------------------------ -// -// uregex_groupUTextDeep -// -//------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 -uregex_groupUTextDeep(URegularExpression *regexp2, - int32_t groupNum, - UText *dest, - UErrorCode *status) { - RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, TRUE, status) == FALSE) { - UErrorCode emptyTextStatus = U_ZERO_ERROR; - return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); - } - - if (regexp->fText != NULL) { - // - // Pick up the range of characters from the matcher - // and use our already-extracted characters - // - int32_t startIx = regexp->fMatcher->start(groupNum, *status); - int32_t endIx = regexp->fMatcher->end (groupNum, *status); - if (U_FAILURE(*status)) { - UErrorCode emptyTextStatus = U_ZERO_ERROR; - return (dest ? 
dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); - } - - if (dest) { - utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status); - } else { - UText groupText = UTEXT_INITIALIZER; - utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status); - dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); - utext_close(&groupText); - } - - return dest; - } else { - return regexp->fMatcher->group(groupNum, dest, *status); - } -} - //------------------------------------------------------------------------------ // // uregex_start diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c index 61751c51c61..32f7f6bd211 100644 --- a/icu4c/source/test/cintltst/reapits.c +++ b/icu4c/source/test/cintltst/reapits.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2004-2014, International Business Machines Corporation and + * Copyright (c) 2004-2015, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -1754,16 +1754,14 @@ static void TestUTextAPI(void) { } /* - * group() + * groupUText() */ { UChar text1[80]; UText *actual; UBool result; - - const char str_abcinteriordef[] = { 0x61, 0x62, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x69, 0x6f, 0x72, 0x20, 0x64, 0x65, 0x66, 0x00 }; /* abc interior def */ - const char str_interior[] = { 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x69, 0x6f, 0x72, 0x20, 0x00 }; /* ' interior ' */ - + int64_t groupLen = 0; + UChar groupBuf[20]; u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); @@ -1775,58 +1773,38 @@ static void TestUTextAPI(void) { result = uregex_find(re, 0, &status); TEST_ASSERT(result==TRUE); - /* Capture Group 0, the full match. Should succeed. 
*/ - status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 0, NULL, &status); - TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_UTEXT(str_abcinteriordef, actual); - utext_close(actual); - /* Capture Group 0 with shallow clone API. Should succeed. */ status = U_ZERO_ERROR; - { - int64_t group_len; - int32_t len16; - UErrorCode shallowStatus = U_ZERO_ERROR; - int64_t nativeIndex; - UChar *groupChars; - UText groupText = UTEXT_INITIALIZER; + actual = uregex_groupUText(re, 0, NULL, &groupLen, &status); + TEST_ASSERT_SUCCESS(status); - actual = uregex_groupUText(re, 0, NULL, &group_len, &status); - TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(utext_getNativeIndex(actual) == 6); /* index of "abc " within "noise abc ..." */ + TEST_ASSERT(groupLen == 16); /* length of "abc interior def" */ + utext_extract(actual, 6 /*start index */, 6+16 /*limit index*/, groupBuf, UPRV_LENGTHOF(groupBuf) /*capacity in UChars*/, &status); - nativeIndex = utext_getNativeIndex(actual); - /* Following returns U_INDEX_OUTOFBOUNDS_ERROR... looks like a bug in ucstrFuncs UTextFuncs [utext.cpp] */ - /* len16 = utext_extract(actual, nativeIndex, nativeIndex + group_len, NULL, 0, &shallowStatus); */ - len16 = (int32_t)group_len; - - groupChars = (UChar *)malloc(sizeof(UChar)*(len16+1)); - utext_extract(actual, nativeIndex, nativeIndex + group_len, groupChars, len16+1, &shallowStatus); - - utext_openUChars(&groupText, groupChars, len16, &shallowStatus); - - TEST_ASSERT_UTEXT(str_abcinteriordef, &groupText); - utext_close(&groupText); - free(groupChars); - } + TEST_ASSERT_STRING("abc interior def", groupBuf, TRUE); utext_close(actual); /* Capture group #1. Should succeed. */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 1, NULL, &status); + + actual = uregex_groupUText(re, 1, NULL, &groupLen, &status); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_UTEXT(str_interior, actual); + TEST_ASSERT(9 == utext_getNativeIndex(actual)); /* index of " interior " within "noise abc interior def ... 
" */ + /* (within the string text1) */ + TEST_ASSERT(10 == groupLen); /* length of " interior " */ + utext_extract(actual, 9 /*start index*/, 9+10 /*limit index*/, groupBuf, UPRV_LENGTHOF(groupBuf) /*capacity in UChars*/, &status); + TEST_ASSERT_STRING(" interior ", groupBuf, TRUE); + utext_close(actual); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 2, NULL, &status); + actual = uregex_groupUText(re, 2, NULL, &groupLen, &status); TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); - TEST_ASSERT(utext_nativeLength(actual) == 0); utext_close(actual); uregex_close(re); - } /* diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 3ca29bbb902..ee287f2f32b 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2014, International Business Machines Corporation and + * Copyright (c) 2002-2015, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -38,6 +38,7 @@ #include #include #include +#include "cmemory.h" #include "cstring.h" #include "uinvchar.h" @@ -239,7 +240,12 @@ if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} -#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};} +// expected: const char * , restricted to invariant characters. 
+// actual: const UnicodeString & +#define REGEX_ASSERT_UNISTR(expected, actual) { \ + if (UnicodeString(expected, -1, US_INV) != (actual)) { \ + errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ + __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} static UBool testUTextEqual(UText *uta, UText *utb) { @@ -2050,47 +2056,72 @@ void RegexTest::API_Match_UTF8() { utext_close(&destText); utext_openUnicodeString(&destText, &dest, &status); - result = matcher->group(0, NULL, status); + int64_t length; + result = matcher->group(0, NULL, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); utext_close(result); - result = matcher->group(0, &destText, status); + result = matcher->group(0, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 0); + REGEX_ASSERT(length == 10); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); - result = matcher->group(1, NULL, status); + // Capture Group 1 == "234567" + result = matcher->group(1, NULL, length, status); REGEX_CHECK_STATUS; - const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(1, &destText, status); + + result = matcher->group(1, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_234567, result); - - result = matcher->group(2, NULL, status); - REGEX_CHECK_STATUS; - const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ - REGEX_ASSERT_UTEXT_UTF8(str_45, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 2); + REGEX_ASSERT(length == 6); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - 
result = matcher->group(2, &destText, status); - REGEX_CHECK_STATUS; - REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_45, result); - result = matcher->group(3, NULL, status); + // Capture Group 2 == "45" + result = matcher->group(2, NULL, length, status); REGEX_CHECK_STATUS; - const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); utext_close(result); - result = matcher->group(3, &destText, status); + + result = matcher->group(2, &destText, length, status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &destText); - REGEX_ASSERT_UTEXT_UTF8(str_89, result); + REGEX_ASSERT(utext_getNativeIndex(result) == 4); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + // Capture Group 3 == "89" + result = matcher->group(3, NULL, length, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + + result = matcher->group(3, &destText, length, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT(utext_getNativeIndex(result) == 8); + REGEX_ASSERT(length == 2); + REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); + utext_close(result); + + // Capture Group number out of range. + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; matcher->reset(); REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); @@ -3068,6 +3099,37 @@ void RegexTest::API_Pattern_UTF8() { delete pat1; + // + // split of a UText based string, with library allocating output UTexts. 
+ // + { + status = U_ZERO_ERROR; + RegexMatcher matcher(UnicodeString("(:)"), 0, status); + UnicodeString stringToSplit("first:second:third"); + UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); + REGEX_CHECK_STATUS; + + UText *splits[10] = {NULL}; + int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(numFields == 5); + REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); + REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); + REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); + REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); + REGEX_ASSERT(splits[5] == NULL); + + for (int i=0; ipattern(),"(Hello, world)*"); + REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); delete pat1; @@ -4995,7 +5057,11 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text1[80]; UText *actual; UBool result; - u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); + int64_t length = 0; + + u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); + // 012345678901234567890123456789012345678901234567 + // 0 1 2 3 4 status = U_ZERO_ERROR; re = uregex_openC("abc(.*?)def", 0, NULL, &status); @@ -5005,26 +5071,29 @@ void RegexTest::PreAllocatedUTextCAPI () { result = uregex_find(re, 0, &status); REGEX_ASSERT(result==TRUE); - /* Capture Group 0, the full match. Should succeed. */ + /* Capture Group 0, the full match. Should succeed. 
"abc interior def" */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); + actual = uregex_groupUText(re, 0, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 6); + REGEX_ASSERT(length == 16); + REGEX_ASSERT(utext_nativeLength(actual) == 47); - /* Capture group #1. Should succeed. */ + /* Capture group #1. Should succeed, matching " interior ". */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); + actual = uregex_groupUText(re, 1, &bufferText, &length, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(actual == &bufferText); - REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); + REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " + REGEX_ASSERT(length == 10); + REGEX_ASSERT(utext_nativeLength(actual) == 47); /* Capture group out of range. Error. */ status = U_ZERO_ERROR; - actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); + actual = uregex_groupUText(re, 2, &bufferText, &length, &status); REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT(actual == &bufferText); - uregex_close(re); } @@ -5037,10 +5106,12 @@ void RegexTest::PreAllocatedUTextCAPI () { UChar text2[80]; UText replText = UTEXT_INITIALIZER; UText *result; + status = U_ZERO_ERROR; + utext_openUnicodeString(&bufferText, &buffer, &status); status = U_ZERO_ERROR; - u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); - u_uastrncpy(text2, "No match here.", sizeof(text2)/2); + u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); + u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)); regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); re = uregex_openC("x(.*?)x", 0, NULL, &status); @@ -5048,7 +5119,9 @@ void RegexTest::PreAllocatedUTextCAPI () { /* Normal case, with match */ uregex_setText(re, text1, -1, 
&status); + REGEX_CHECK_STATUS; utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); + REGEX_CHECK_STATUS; result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); REGEX_CHECK_STATUS; REGEX_ASSERT(result == &bufferText);