ICU-11469 Regular Expressions, remove old tech preview functions.

X-SVN-Rev: 36953
2025-04-13 08:53:20 +00:00 · 2015-01-14 00:03:29 +00:00 · 2015-01-14 00:03:29 +00:00 · 22c8c94d14
commit 22c8c94d14
parent 069313c959
6 changed files with 226 additions and 260 deletions
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -1,6 +1,6 @@
 /*
 **************************************************************************
-*   Copyright (C) 2002-2014 International Business Machines Corporation  *
+*   Copyright (C) 2002-2015 International Business Machines Corporation  *
 *   and others. All rights reserved.                                     *
 **************************************************************************
 */
@ -1175,97 +1175,32 @@ UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE

 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
    UnicodeString result;
-    if (U_FAILURE(status)) {
+    int64_t groupStart = start64(groupNum, status);
+    int64_t groupEnd = end64(groupNum, status);
+    if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
        return result;
    }
-    UText resultText = UTEXT_INITIALIZER;
-    utext_openUnicodeString(&resultText, &result, &status);
-    group(groupNum, &resultText, status);
-    utext_close(&resultText);
+
+    // Get the group length using a utext_extract preflight.
+    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
+    int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
+    if (status != U_BUFFER_OVERFLOW_ERROR) {
+        return result;
+    }
+
+    status = U_ZERO_ERROR;
+    UChar *buf = result.getBuffer(length);
+    if (buf == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    } else {
+        int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
+        result.releaseBuffer(extractLength);
+        U_ASSERT(length == extractLength);
+    }
    return result;
 }


-//  Return deep (mutable) clone
-//      Technology Preview (as an API), but note that the UnicodeString API is implemented
-//      using this function.
-UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
-    if (U_FAILURE(status)) {
-        return dest;
-    }
-
-    if (U_FAILURE(fDeferredStatus)) {
-        status = fDeferredStatus;
-    } else if (fMatch == FALSE) {
-        status = U_REGEX_INVALID_STATE;
-    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
-        status = U_INDEX_OUTOFBOUNDS_ERROR;
-    }
-    if (U_FAILURE(status)) {
-        return dest;
-    }
-
-    int64_t s, e;
-    if (groupNum == 0) {
-        s = fMatchStart;
-        e = fMatchEnd;
-    } else {
-        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
-        U_ASSERT(groupOffset < fPattern->fFrameSize);
-        U_ASSERT(groupOffset >= 0);
-        s = fFrame->fExtra[groupOffset];
-        e = fFrame->fExtra[groupOffset+1];
-    }
-
-    if (s < 0) {
-        // A capture group wasn't part of the match
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
-            return dest;
-        } else {
-            return utext_openUChars(NULL, NULL, 0, &status);
-        }
-    }
-    U_ASSERT(s <= e);
-
-    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
-        U_ASSERT(e <= fInputLength);
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
-        } else {
-            UText groupText = UTEXT_INITIALIZER;
-            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
-            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
-            utext_close(&groupText);
-        }
-    } else {
-        int32_t len16;
-        if (UTEXT_USES_U16(fInputText)) {
-            len16 = (int32_t)(e-s);
-        } else {
-            UErrorCode lengthStatus = U_ZERO_ERROR;
-            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
-        }
-        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
-        if (groupChars == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return dest;
-        }
-        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
-
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
-        } else {
-            UText groupText = UTEXT_INITIALIZER;
-            utext_openUChars(&groupText, groupChars, len16, &status);
-            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
-            utext_close(&groupText);
-        }
-
-        uprv_free(groupChars);
-    }
-    return dest;
-}

 //--------------------------------------------------------------------------------
 //
@ -2001,6 +1936,67 @@ void RegexMatcher::setTrace(UBool state) {



+/**
+  *  UText, replace entire contents of the destination UText with a substring of the source UText.
+  *  The substring is extracted into a temporary UChar buffer, then placed into dest or adopted by a new UText.
+  *     @param src    The source UText
+  *     @param dest   The destination UText; must be writable. If NULL, a new UText is opened.
+  *     @param start  Start index of source substring.
+  *     @param limit  Limit index of source substring.
+  *     @param status An error code. If it already indicates failure, dest is returned unchanged.
+  *     @return       dest when it is non-NULL; otherwise the newly opened UText, or NULL on error.
+  */
+static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
+    if (U_FAILURE(*status)) {
+        return dest;
+    }
+    if (start == limit) {
+        if (dest) {
+            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
+            return dest;
+        } else {
+            return utext_openUChars(NULL, NULL, 0, status);
+        }
+    }
+    int32_t length = utext_extract(src, start, limit, NULL, 0, status);   // Preflight: NULL buffer, zero capacity.
+    if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
+        return dest;
+    }
+    *status = U_ZERO_ERROR;   // Clear the buffer overflow status that the preflight may have set.
+    MaybeStackArray<UChar, 40> buffer;
+    if (length >= buffer.getCapacity()) {
+        UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
+        if (newBuf == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
+    if (dest) {
+        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
+        return dest;
+    }
+
+    // Caller did not provide a preexisting UText.
+    // Open a new one, and have it adopt the text buffer storage.
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    int32_t ownedLength = 0;
+    UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
+    if (ownedBuf == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    UText *result = utext_openUChars(NULL, ownedBuf, length, status);
+    if (U_FAILURE(*status)) {
+        uprv_free(ownedBuf);
+        return NULL;
+    }
+    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);   // Flag the new UText as owning ownedBuf.
+    return result;
+}
+
+
 //---------------------------------------------------------------------
 //
 //   split
@ -2167,7 +2163,8 @@ int32_t  RegexMatcher::split(UText *input,
                    break;
                }
                i++;
-                dest[i] = group(groupNum, dest[i], status);
+                dest[i] = utext_extract_replace(fInputText, dest[i], 
+                                               start64(groupNum, status), end64(groupNum, status), &status);
            }

            if (nextOutputStringStart == fActiveLimit) {
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2002-2014, International Business Machines
+*   Copyright (C) 2002-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  regex.h
@ -896,24 +896,6 @@ public:
    */
    virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;

-   /**
-    *   Returns a string containing the text captured by the given group
-    *   during the previous match operation.  Group(0) is the entire match.
-    *
-    *   @param   groupNum    the capture group number
-    *   @param   dest        A mutable UText in which the matching text is placed.
-    *                        If NULL, a new UText will be created (which may not be mutable).
-    *   @param   status      A reference to a UErrorCode to receive any errors.
-    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
-    *                        has been attempted or the last match failed.
-    *   @return  A string containing the matched input text. If a pre-allocated UText
-    *            was provided, it will always be used and returned.
-    *
-    *   @internal ICU 4.4 technology preview
-    */
-    virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
-
-
   /**
    *   Returns the index in the input string of the start of the text matched
    *   during the previous match operation.
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2004-2013, International Business Machines
+*   Copyright (C) 2004-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  uregex.h
@ -659,31 +659,6 @@ uregex_groupUText(URegularExpression *regexp,
                  int64_t            *groupLength,
                  UErrorCode         *status);

-#ifndef U_HIDE_INTERNAL_API
-/** Extract the string for the specified matching expression or subexpression.
-  * Group #0 is the complete string of matched text.
-  * Group #1 is the text matched by the first set of capturing parentheses.
-  *
-  *   @param   regexp       The compiled regular expression.
-  *   @param   groupNum     The capture group to extract.  Group 0 is the complete
-  *                         match.  The value of this parameter must be
-  *                         less than or equal to the number of capture groups in
-  *                         the pattern.
-  *   @param   dest         Mutable UText to receive the matching string data.
-  *                         If NULL, a new UText will be created (which may not be mutable).
-  *   @param   status       A reference to a UErrorCode to receive any errors.
-  *   @return               The matching string data. If a pre-allocated UText was provided,
-  *                          it will always be used and returned.
-  *
-  *   @internal ICU 4.4 technology preview
-  */
-U_INTERNAL UText * U_EXPORT2 
-uregex_groupUTextDeep(URegularExpression *regexp,
-                  int32_t             groupNum,
-                  UText              *dest,
-                  UErrorCode         *status);
-#endif  /* U_HIDE_INTERNAL_API */
-
 /**
  *   Returns the index in the input string of the start of the text matched by the
  *   specified capture group during the previous match operation.  Return -1 if
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 2004-2014, International Business Machines
+*   Copyright (C) 2004-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  uregex.cpp
@ -647,7 +647,7 @@ uregex_group(URegularExpression *regexp2,

    if (destCapacity == 0 || regexp->fText != NULL) {
        // If preflighting or if we already have the text as UChars,
-        // this is a little cheaper than going through uregex_groupUTextDeep()
+        // this is a little cheaper than extracting from the UText

        //
        // Pick up the range of characters from the matcher
@ -680,14 +680,18 @@ uregex_group(URegularExpression *regexp2,
        }
        return fullLength;
    } else {
-        int32_t result = 0;
-        UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
-        if (U_SUCCESS(*status)) {
-            result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
+        int64_t  start = regexp->fMatcher->start64(groupNum, *status);
+        int64_t  limit = regexp->fMatcher->end64(groupNum, *status);
+        if (U_FAILURE(*status)) {
+            return 0;
        }
-        utext_close(groupText);
-        return result;
+        // Note edge cases:
+        //   Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
+        //   Zero Length Match: start == end.
+        int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
+        return length;
    }
+
 }


@ -711,49 +715,6 @@ uregex_groupUText(URegularExpression *regexp2,
    return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
 }

-//------------------------------------------------------------------------------
-//
-//    uregex_groupUTextDeep
-//
-//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
-uregex_groupUTextDeep(URegularExpression *regexp2,
-                  int32_t             groupNum,
-                  UText              *dest,
-                  UErrorCode         *status)  {
-    RegularExpression *regexp = (RegularExpression*)regexp2;
-    if (validateRE(regexp, TRUE, status) == FALSE) {
-        UErrorCode emptyTextStatus = U_ZERO_ERROR;
-        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
-    }
-
-    if (regexp->fText != NULL) {
-        //
-        // Pick up the range of characters from the matcher
-        // and use our already-extracted characters
-        //
-        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
-        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
-        if (U_FAILURE(*status)) {
-            UErrorCode emptyTextStatus = U_ZERO_ERROR;
-            return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
-        }
-
-        if (dest) {
-            utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
-        } else {
-            UText groupText = UTEXT_INITIALIZER;
-            utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
-            dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
-            utext_close(&groupText);
-        }
-
-        return dest;
-    } else {
-        return regexp->fMatcher->group(groupNum, dest, *status);
-    }
-}
-
 //------------------------------------------------------------------------------
 //
 //    uregex_start
--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2004-2014, International Business Machines Corporation and
+ * Copyright (c) 2004-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /********************************************************************************
@ -1754,16 +1754,14 @@ static void TestUTextAPI(void) {
    }

    /*
-     *  group()
+     *  groupUText()
     */
    {
        UChar    text1[80];
        UText   *actual;
        UBool    result;
-
-        const char str_abcinteriordef[] = { 0x61, 0x62, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x69, 0x6f, 0x72, 0x20, 0x64, 0x65, 0x66, 0x00 }; /* abc interior def */
-        const char str_interior[] = { 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x69, 0x6f, 0x72, 0x20, 0x00 }; /* ' interior ' */
-        
+        int64_t  groupLen = 0;
+        UChar    groupBuf[20];

        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));

@ -1775,58 +1773,38 @@ static void TestUTextAPI(void) {
        result = uregex_find(re, 0, &status);
        TEST_ASSERT(result==TRUE);

-        /*  Capture Group 0, the full match.  Should succeed.  */
-        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 0, NULL, &status);
-        TEST_ASSERT_SUCCESS(status);
-        TEST_ASSERT_UTEXT(str_abcinteriordef, actual);
-        utext_close(actual);
-
        /*  Capture Group 0 with shallow clone API.  Should succeed.  */
        status = U_ZERO_ERROR;
-        {
-            int64_t      group_len;
-            int32_t      len16;
-            UErrorCode   shallowStatus = U_ZERO_ERROR;
-            int64_t      nativeIndex;
-            UChar *groupChars;
-            UText groupText = UTEXT_INITIALIZER;
+        actual = uregex_groupUText(re, 0, NULL, &groupLen, &status);
+        TEST_ASSERT_SUCCESS(status);

-            actual = uregex_groupUText(re, 0, NULL, &group_len, &status);
-            TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(utext_getNativeIndex(actual) == 6);  /* index of "abc " within "noise abc ..." */
+        TEST_ASSERT(groupLen == 16);   /* length of "abc interior def"  */
+        utext_extract(actual, 6 /*start index */, 6+16 /*limit index*/, groupBuf, UPRV_LENGTHOF(groupBuf) /*capacity, in UChars*/, &status);

-            nativeIndex = utext_getNativeIndex(actual);
-            /*  Following returns U_INDEX_OUTOFBOUNDS_ERROR... looks like a bug in ucstrFuncs UTextFuncs [utext.cpp]  */
-            /*  len16 = utext_extract(actual, nativeIndex, nativeIndex + group_len, NULL, 0, &shallowStatus);  */
-            len16 = (int32_t)group_len;
-            
-            groupChars = (UChar *)malloc(sizeof(UChar)*(len16+1));
-            utext_extract(actual, nativeIndex, nativeIndex + group_len, groupChars, len16+1, &shallowStatus);
-
-            utext_openUChars(&groupText, groupChars, len16, &shallowStatus);
-            
-            TEST_ASSERT_UTEXT(str_abcinteriordef, &groupText);
-            utext_close(&groupText);
-            free(groupChars);
-        }
+        TEST_ASSERT_STRING("abc interior def", groupBuf, TRUE);
        utext_close(actual);

        /*  Capture group #1.  Should succeed. */
        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 1, NULL, &status);
+
+        actual = uregex_groupUText(re, 1, NULL, &groupLen, &status);
        TEST_ASSERT_SUCCESS(status);
-        TEST_ASSERT_UTEXT(str_interior, actual);
+        TEST_ASSERT(9 == utext_getNativeIndex(actual));    /* index of " interior " within "noise abc interior def ... " */
+                                                           /*    (within the string text1)           */
+        TEST_ASSERT(10 == groupLen);                       /* length of " interior " */
+        utext_extract(actual, 9 /*start index*/, 9+10 /*limit index*/, groupBuf, UPRV_LENGTHOF(groupBuf) /*capacity, in UChars*/, &status);
+        TEST_ASSERT_STRING(" interior ", groupBuf, TRUE);
+
        utext_close(actual);

        /*  Capture group out of range.  Error. */
        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 2, NULL, &status);
+        actual = uregex_groupUText(re, 2, NULL, &groupLen, &status);
        TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
-        TEST_ASSERT(utext_nativeLength(actual) == 0);
        utext_close(actual);

        uregex_close(re);
-
    }
    
    /*
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (c) 2002-2014, International Business Machines Corporation and
+ * Copyright (c) 2002-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/

@ -38,6 +38,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include "cmemory.h"
 #include "cstring.h"
 #include "uinvchar.h"

@ -239,7 +240,12 @@ if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=
 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

-#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
+// expected: const char * , restricted to invariant characters.
+// actual: const UnicodeString &
+#define REGEX_ASSERT_UNISTR(expected, actual) { \
+    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
+        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
+                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}


 static UBool testUTextEqual(UText *uta, UText *utb) {
@ -2050,47 +2056,72 @@ void RegexTest::API_Match_UTF8() {
        utext_close(&destText);
        utext_openUnicodeString(&destText, &dest, &status);

-        result = matcher->group(0, NULL, status);
+        int64_t length;
+        result = matcher->group(0, NULL, length, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
        utext_close(result);
-        result = matcher->group(0, &destText, status);
+        result = matcher->group(0, &destText, length, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
-        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 0);
+        REGEX_ASSERT(length == 10);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

-        result = matcher->group(1, NULL, status);
+        // Capture Group 1 == "234567"
+        result = matcher->group(1, NULL, length, status);
        REGEX_CHECK_STATUS;
-        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
-        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
+        REGEX_ASSERT(length == 6);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
        utext_close(result);
-        result = matcher->group(1, &destText, status);
+
+        result = matcher->group(1, &destText, length, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
-        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
-
-        result = matcher->group(2, NULL, status);
-        REGEX_CHECK_STATUS;
-        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
-        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
+        REGEX_ASSERT(length == 6);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
        utext_close(result);
-        result = matcher->group(2, &destText, status);
-        REGEX_CHECK_STATUS;
-        REGEX_ASSERT(result == &destText);
-        REGEX_ASSERT_UTEXT_UTF8(str_45, result);

-        result = matcher->group(3, NULL, status);
+        // Capture Group 2 == "45"
+        result = matcher->group(2, NULL, length, status);
        REGEX_CHECK_STATUS;
-        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
-        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
+        REGEX_ASSERT(length == 2);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
        utext_close(result);
-        result = matcher->group(3, &destText, status);
+
+        result = matcher->group(2, &destText, length, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
-        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
+        REGEX_ASSERT(length == 2);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
+        utext_close(result);

+        // Capture Group 3 == "89"
+        result = matcher->group(3, NULL, length, status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
+        REGEX_ASSERT(length == 2);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
+        utext_close(result);
+
+        result = matcher->group(3, &destText, length, status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &destText);
+        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
+        REGEX_ASSERT(length == 2);
+        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
+        utext_close(result);
+
+        // Capture Group number out of range.
+        status = U_ZERO_ERROR;
        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
+        status = U_ZERO_ERROR;
        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
+        status = U_ZERO_ERROR;
        matcher->reset();
        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);

@ -3068,6 +3099,37 @@ void RegexTest::API_Pattern_UTF8() {
    delete pat1;


+    //
+    // split of a UText based string, with library allocating output UTexts.
+    //
+    {
+        status = U_ZERO_ERROR;
+        RegexMatcher matcher(UnicodeString("(:)"), 0, status);
+        UnicodeString stringToSplit("first:second:third");
+        UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
+        REGEX_CHECK_STATUS;
+        
+        UText *splits[10] = {NULL};
+        int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(numFields == 5);
+        REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
+        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
+        REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
+        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
+        REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
+        REGEX_ASSERT(splits[5] == NULL);
+
+        for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
+            if (splits[i]) {
+                utext_close(splits[i]);
+                splits[i] = NULL;
+            }
+        }
+        utext_close(textToSplit);
+    }
+
+
    //
    // RegexPattern::pattern() and patternText()
    //
@ -3079,7 +3141,7 @@ void RegexTest::API_Pattern_UTF8() {
    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
    pat1 = RegexPattern::compile(&re1, pe, status);
    REGEX_CHECK_STATUS;
-    REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
+    REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
    delete pat1;

@ -4995,7 +5057,11 @@ void RegexTest::PreAllocatedUTextCAPI () {
        UChar    text1[80];
        UText   *actual;
        UBool    result;
-        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
+        int64_t  length = 0;
+
+        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
+        //                  012345678901234567890123456789012345678901234567
+        //                  0         1         2         3         4

        status = U_ZERO_ERROR;
        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
@ -5005,26 +5071,29 @@ void RegexTest::PreAllocatedUTextCAPI () {
        result = uregex_find(re, 0, &status);
        REGEX_ASSERT(result==TRUE);

-        /*  Capture Group 0, the full match.  Should succeed.  */
+        /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
+        actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(actual == &bufferText);
-        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
+        REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
+        REGEX_ASSERT(length == 16);
+        REGEX_ASSERT(utext_nativeLength(actual) == 47);

-        /*  Capture group #1.  Should succeed. */
+        /*  Capture group #1.  Should succeed, matching " interior ". */
        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
+        actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(actual == &bufferText);
-        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
+        REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
+        REGEX_ASSERT(length == 10);
+        REGEX_ASSERT(utext_nativeLength(actual) == 47);

        /*  Capture group out of range.  Error. */
        status = U_ZERO_ERROR;
-        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
+        actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
        REGEX_ASSERT(actual == &bufferText);
-
        uregex_close(re);

    }
@ -5037,10 +5106,12 @@ void RegexTest::PreAllocatedUTextCAPI () {
        UChar    text2[80];
        UText    replText = UTEXT_INITIALIZER;
        UText   *result;
+        status = U_ZERO_ERROR;
+        utext_openUnicodeString(&bufferText, &buffer, &status);

        status = U_ZERO_ERROR;
-        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
-        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
+        u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
+        u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2));   // UPRV_LENGTHOF already counts UChars; no /2 needed.
        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);

        re = uregex_openC("x(.*?)x", 0, NULL, &status);
@ -5048,7 +5119,9 @@ void RegexTest::PreAllocatedUTextCAPI () {

        /*  Normal case, with match */
        uregex_setText(re, text1, -1, &status);
+        REGEX_CHECK_STATUS;
        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        REGEX_CHECK_STATUS;
        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &bufferText);