ICU-6677 add u_strToUTF32WithSub() and u_strFromUTF32WithSub()

X-SVN-Rev: 25444
2025-04-07 06:25:30 +00:00 · 2009-02-19 20:02:40 +00:00 · 2009-02-19 20:02:40 +00:00 · c4e1d3e0be
commit c4e1d3e0be
parent a679ccf60c
3 changed files with 309 additions and 71 deletions
--- a/icu4c/source/common/unicode/ustring.h
+++ b/icu4c/source/common/unicode/ustring.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1998-2008, International Business Machines
+*   Copyright (C) 1998-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
@ -1183,7 +1183,10 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,

 #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
 /**
- * Converts a sequence of UChars to wchar_t units.
+ * Convert a UTF-16 string to a wchar_t string.
+ * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
+ * this function simply calls the fast, dedicated function for that.
+ * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1209,7 +1212,10 @@ u_strToWCS(wchar_t *dest,
           int32_t srcLength,
           UErrorCode *pErrorCode);
 /**
- * Converts a sequence of wchar_t units to UChars
+ * Convert a wchar_t string to UTF-16.
+ * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
+ * this function simply calls the fast, dedicated function for that.
+ * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1237,7 +1243,8 @@ u_strFromWCS(UChar   *dest,
 #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */

 /**
- * Converts a sequence of UChars (UTF-16) to UTF-8 bytes
+ * Convert a UTF-16 string to UTF-8.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1266,7 +1273,8 @@ u_strToUTF8(char *dest,
            UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1295,7 +1303,9 @@ u_strFromUTF8(UChar *dest,
              UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UChars (UTF-16) to UTF-8 bytes.
+ * Convert a UTF-16 string to UTF-8.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set only if subchar==U_SENTINEL.
+ *
 * Same as u_strToUTF8() except for the additional subchar which is output for
 * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
 * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
@ -1338,7 +1348,9 @@ u_strToUTF8WithSub(char *dest,
            UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set only if subchar==U_SENTINEL.
+ *
 * Same as u_strFromUTF8() except for the additional subchar which is output for
 * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
 * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
@ -1382,7 +1394,8 @@ u_strFromUTF8WithSub(UChar *dest,
              UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UTF-8 bytes to UChars (UTF-16).
+ * Convert a UTF-8 string to UTF-16.
+ *
 * Same as u_strFromUTF8() except that this function is designed to be very fast,
 * which it achieves by being lenient about malformed UTF-8 sequences.
 * This function is intended for use in environments where UTF-8 text is
@ -1401,6 +1414,9 @@ u_strFromUTF8WithSub(UChar *dest,
 * For further performance improvement, if srcLength is given (>=0),
 * then it must be destCapacity>=srcLength.
 *
+ * There is no inverse u_strToUTF8Lenient() function because there is practically
+ * no performance gain from not checking that a UTF-16 string is well-formed.
+ *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
 * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
@ -1437,7 +1453,8 @@ u_strFromUTF8Lenient(UChar *dest,
                     UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UChars (UTF-16) to UTF32 units.
+ * Convert a UTF-16 string to UTF-32.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1453,6 +1470,8 @@ u_strFromUTF8Lenient(UChar *dest,
 * @param pErrorCode    Must be a valid pointer to an error code value,
 *                      which must not indicate a failure before the function call.
 * @return The pointer to destination buffer.
+ * @see u_strToUTF32WithSub
+ * @see u_strFromUTF32
 * @stable ICU 2.0
 */
 U_STABLE UChar32* U_EXPORT2 
@ -1464,7 +1483,8 @@ u_strToUTF32(UChar32 *dest,
             UErrorCode *pErrorCode);

 /**
- * Converts a sequence of UTF32 units to UChars (UTF-16)
+ * Convert a UTF-32 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
 *
 * @param dest          A buffer for the result string. The result will be zero-terminated if
 *                      the buffer is large enough.
@ -1480,6 +1500,8 @@ u_strToUTF32(UChar32 *dest,
 * @param pErrorCode    Must be a valid pointer to an error code value,
 *                      which must not indicate a failure before the function call.
 * @return The pointer to destination buffer.
+ * @see u_strFromUTF32WithSub
+ * @see u_strToUTF32
 * @stable ICU 2.0
 */
 U_STABLE UChar* U_EXPORT2 
@ -1490,4 +1512,94 @@ u_strFromUTF32(UChar   *dest,
               int32_t srcLength,
               UErrorCode *pErrorCode);

+/**
+ * Convert a UTF-16 string to UTF-32.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set only if subchar==U_SENTINEL.
+ *
+ * Same as u_strToUTF32() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChar32s). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If
+ *                      pDestLength!=NULL then *pDestLength is always set to the
+ *                      number of output units corresponding to the transformation of
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strToUTF32
+ * @see u_strFromUTF32WithSub
+ * @draft ICU 4.2
+ */
+U_DRAFT UChar32* U_EXPORT2
+u_strToUTF32WithSub(UChar32 *dest,
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src,
+             int32_t srcLength,
+             UChar32 subchar, int32_t *pNumSubstitutions,
+             UErrorCode *pErrorCode);
+
+/**
+ * Convert a UTF-32 string to UTF-16.
+ * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set only if subchar==U_SENTINEL.
+ *
+ * Same as u_strFromUTF32() except for the additional subchar which is output for
+ * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
+ * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
+ *
+ * @param dest          A buffer for the result string. The result will be zero-terminated if
+ *                      the buffer is large enough.
+ * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
+ *                      dest may be NULL and the function will only return the length of the
+ *                      result without writing any of the result string (pre-flighting).
+ * @param pDestLength   A pointer to receive the number of units written to the destination. If
+ *                      pDestLength!=NULL then *pDestLength is always set to the
+ *                      number of output units corresponding to the transformation of
+ *                      all the input units, even in case of a buffer overflow.
+ * @param src           The original source string
+ * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
+ * @param subchar       The substitution character to use in place of an illegal input sequence,
+ *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
+ *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
+ *                      except for surrogate code points (U+D800..U+DFFF).
+ *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
+ * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
+ *                      Set to 0 if no substitutions occur or subchar<0.
+ *                      pNumSubstitutions can be NULL.
+ * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
+ *                      pass the U_SUCCESS() test, or else the function returns
+ *                      immediately. Check for U_FAILURE() on output or use with
+ *                      function chaining. (See User Guide for details.)
+ * @return The pointer to destination buffer.
+ * @see u_strFromUTF32
+ * @see u_strToUTF32WithSub
+ * @draft ICU 4.2
+ */
+U_DRAFT UChar* U_EXPORT2
+u_strFromUTF32WithSub(UChar *dest,
+               int32_t destCapacity,
+               int32_t *pDestLength,
+               const UChar32 *src,
+               int32_t srcLength,
+               UChar32 subchar, int32_t *pNumSubstitutions,
+               UErrorCode *pErrorCode);
+
 #endif
--- a/icu4c/source/common/ustrtrns.c
+++ b/icu4c/source/common/ustrtrns.c
@ -31,38 +31,50 @@
 #include "ustr_imp.h"

 U_CAPI UChar* U_EXPORT2 
-u_strFromUTF32(UChar *dest,
-               int32_t destCapacity, 
+u_strFromUTF32WithSub(UChar *dest,
+               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
+               UChar32 subchar, int32_t *pNumSubstitutions,
               UErrorCode *pErrorCode) {
    const UChar32 *srcLimit;
    UChar32 ch;
    UChar *destLimit;
    UChar *pDest;
    int32_t reqLength;
+    int32_t numSubstitutions;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
+    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
    pDest = dest;
    destLimit = dest + destCapacity;
    reqLength = 0;
+    numSubstitutions = 0;

    if(srcLength < 0) {
        /* simple loop for conversion of a NUL-terminated BMP string */
        while((ch=*src) != 0 &&
-              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) &&
-              pDest < destLimit) {
+              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
            ++src;
-            *pDest++ = (UChar)ch;
+            if(pDest < destLimit) {
+                *pDest++ = (UChar)ch;
+            } else {
+                ++reqLength;
+            }
        }
        srcLimit = src;
        if(ch != 0) {
@ -74,43 +86,42 @@ u_strFromUTF32(UChar *dest,
    }

    /* convert with length */
-    while(src < srcLimit && pDest < destLimit) {
-        ch = *src++;
-        if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
-            *pDest++ = (UChar)ch;
-        } else if(0x10000 <= ch && ch <= 0x10ffff) {
-            *pDest++ = U16_LEAD(ch);
-            if(pDest < destLimit) {
-                *pDest++ = U16_TRAIL(ch);
-            } else {
-                reqLength = 1;
-                break;
-            }
-        } else {
-            /* surrogate code point, or not a Unicode code point at all */
-            *pErrorCode = U_INVALID_CHAR_FOUND;
-            return NULL;
-        }
-    }
-
-    /* preflight the remaining string */
    while(src < srcLimit) {
        ch = *src++;
-        if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
-            ++reqLength;
-        } else if(0x10000 <= ch && ch <= 0x10ffff) {
-            reqLength += 2;
-        } else {
-            /* surrogate code point, or not a Unicode code point at all */
-            *pErrorCode = U_INVALID_CHAR_FOUND;
-            return NULL;
-        }
+        do {
+            /* usually "loops" once; twice only for writing subchar */
+            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
+                if(pDest < destLimit) {
+                    *pDest++ = (UChar)ch;
+                } else {
+                    ++reqLength;
+                }
+                break;
+            } else if(0x10000 <= ch && ch <= 0x10ffff) {
+                if((pDest + 2) <= destLimit) {
+                    *pDest++ = U16_LEAD(ch);
+                    *pDest++ = U16_TRAIL(ch);
+                } else {
+                    reqLength += 2;
+                }
+                break;
+            } else if((ch = subchar) < 0) {
+                /* surrogate code point, or not a Unicode code point at all */
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            } else {
+                ++numSubstitutions;
+            }
+        } while(TRUE);
    }

    reqLength += (int32_t)(pDest - dest);
    if(pDestLength) {
        *pDestLength = reqLength;
    }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
@ -118,13 +129,27 @@ u_strFromUTF32(UChar *dest,
    return dest;
 }

+U_CAPI UChar* U_EXPORT2 
+u_strFromUTF32(UChar *dest,
+               int32_t destCapacity, 
+               int32_t *pDestLength,
+               const UChar32 *src,
+               int32_t srcLength,
+               UErrorCode *pErrorCode) { /* UTF-32 -> UTF-16 without a substitution character */
+    return u_strFromUTF32WithSub( /* all argument checking and conversion is delegated */
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL, /* no subchar: illegal input sets U_INVALID_CHAR_FOUND; substitution count not requested */
+            pErrorCode);
+}

 U_CAPI UChar32* U_EXPORT2 
-u_strToUTF32(UChar32 *dest, 
+u_strToUTF32WithSub(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
-             const UChar *src, 
+             const UChar *src,
             int32_t srcLength,
+             UChar32 subchar, int32_t *pNumSubstitutions,
             UErrorCode *pErrorCode) {
    const UChar *srcLimit;
    UChar32 ch;
@ -132,25 +157,37 @@ u_strToUTF32(UChar32 *dest,
    UChar32 *destLimit;
    UChar32 *pDest;
    int32_t reqLength;
+    int32_t numSubstitutions;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
+    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
    pDest = dest;
    destLimit = dest + destCapacity;
    reqLength = 0;
+    numSubstitutions = 0;

    if(srcLength < 0) {
        /* simple loop for conversion of a NUL-terminated BMP string */
-        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch) && pDest < destLimit) {
+        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
            ++src;
-            *pDest++ = ch;
+            if(pDest < destLimit) {
+                *pDest++ = ch;
+            } else {
+                ++reqLength;
+            }
        }
        srcLimit = src;
        if(ch != 0) {
@ -162,47 +199,55 @@ u_strToUTF32(UChar32 *dest,
    }

    /* convert with length */
-    while(src < srcLimit && pDest < destLimit) {
-        ch = *src++;
-        if(!U16_IS_SURROGATE(ch)) {
-            /* write ch below */
-        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
-            ++src;
-            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
-        } else {
-            /* unpaired surrogate */
-            *pErrorCode = U_INVALID_CHAR_FOUND;
-            return NULL;
-        }
-        *pDest++ = ch;
-    }
-
-    /* preflight the remaining string */
    while(src < srcLimit) {
        ch = *src++;
        if(!U16_IS_SURROGATE(ch)) {
-            /* ++reqLength below */
-        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(*src)) {
+            /* write or count ch below */
+        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
            ++src;
-        } else {
+            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
+        } else if((ch = subchar) < 0) {
            /* unpaired surrogate */
            *pErrorCode = U_INVALID_CHAR_FOUND;
            return NULL;
+        } else {
+            ++numSubstitutions;
+        }
+        if(pDest < destLimit) {
+            *pDest++ = ch;
+        } else {
+            ++reqLength;
        }
-        ++reqLength;
    }

    reqLength += (int32_t)(pDest - dest);
    if(pDestLength) {
        *pDestLength = reqLength;
    }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }

    /* Terminate the buffer */
    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
-    
+
    return dest;
 }

+U_CAPI UChar32* U_EXPORT2 
+u_strToUTF32(UChar32 *dest, 
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src, 
+             int32_t srcLength,
+             UErrorCode *pErrorCode) { /* UTF-16 -> UTF-32 without a substitution character */
+    return u_strToUTF32WithSub( /* all argument checking and conversion is delegated */
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL, /* no subchar: an unpaired surrogate sets U_INVALID_CHAR_FOUND; substitution count not requested */
+            pErrorCode);
+}
+
 /* for utf8_nextCharSafeBodyTerminated() */
 static const UChar32
 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
@ -372,6 +417,9 @@ u_strFromUTF8WithSub(UChar *dest,
        return NULL;
    }

+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
    numSubstitutions=0;

    /*
@ -948,6 +996,9 @@ u_strToUTF8WithSub(char *dest,
        return NULL;
    }

+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
    numSubstitutions=0;

    if(srcLength==-1) {
--- a/icu4c/source/test/cintltst/custrtrn.c
+++ b/icu4c/source/test/cintltst/custrtrn.c
@ -202,10 +202,13 @@ static void Test_strToUTF32_surrogates() {
    UErrorCode err = U_ZERO_ERROR;
    UChar32 u32Target[400];
    int32_t len16, u32DestLen;
+    int32_t numSubstitutions;
    int i;

    static const UChar surr16[] = { 0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
    static const UChar32 expected[] = { 0x5a, 0x50000, 0x7a, 0 };
+    static const UChar32 expected_FFFD[] = { 0x41, 0xfffd, 0x61, 0xfffd, 0x5a, 0x50000, 0x7a, 0 };
+    static const UChar32 expected_12345[] = { 0x41, 0x12345, 0x61, 0x12345, 0x5a, 0x50000, 0x7a, 0 };
    len16 = LENGTHOF(surr16);
    for(i = 0; i < 4; ++i) {
        err = U_ZERO_ERROR;
@ -272,6 +275,40 @@ static void Test_strToUTF32_surrogates() {
                u_errorName(err));
        return;
    }
+
+    /* with substitution character */
+    numSubstitutions = -1;
+    err = U_ZERO_ERROR;
+    u_strToUTF32WithSub(u32Target, 0, &u32DestLen, surr16, len16-1, 0xfffd, &numSubstitutions, &err);
+    if(err != U_BUFFER_OVERFLOW_ERROR || u32DestLen != 7 || numSubstitutions != 2) {
+        log_err("u_strToUTF32WithSub(preflight surr16) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strToUTF32WithSub(u32Target, LENGTHOF(u32Target), &u32DestLen, surr16, len16-1, 0xfffd, &numSubstitutions, &err);
+    if(err != U_ZERO_ERROR || u32DestLen != 7 || numSubstitutions != 2 || uprv_memcmp(u32Target, expected_FFFD, 8*4)) {
+        log_err("u_strToUTF32WithSub(surr16) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strToUTF32WithSub(NULL, 0, &u32DestLen, surr16, -1, 0x12345, &numSubstitutions, &err);
+    if(err != U_BUFFER_OVERFLOW_ERROR || u32DestLen != 7 || numSubstitutions != 2) {
+        log_err("u_strToUTF32WithSub(preflight surr16/NUL) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strToUTF32WithSub(u32Target, LENGTHOF(u32Target), &u32DestLen, surr16, -1, 0x12345, &numSubstitutions, &err);
+    if(err != U_ZERO_ERROR || u32DestLen != 7 || numSubstitutions != 2 || uprv_memcmp(u32Target, expected_12345, 8*4)) {
+        log_err("u_strToUTF32WithSub(surr16/NUL) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
+                u_errorName(err));
+        return;
+    }
 }

 static void Test_strFromUTF32(void){
@ -345,10 +382,14 @@ static void Test_strFromUTF32_surrogates() {
    UErrorCode err = U_ZERO_ERROR;
    UChar uTarget[400];
    int32_t len32, uDestLen;
+    int32_t numSubstitutions;
    int i;

    static const UChar32 surr32[] = { 0x41, 0xd900, 0x61, 0xdc00, -1, 0x110000, 0x5a, 0x50000, 0x7a, 0 };
    static const UChar expected[] = { 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
+    static const UChar expected_FFFD[] = { 0x41, 0xfffd, 0x61, 0xfffd, 0xfffd, 0xfffd, 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
+    static const UChar expected_12345[] = { 0x41, 0xd808, 0xdf45, 0x61, 0xd808, 0xdf45, 0xd808, 0xdf45, 0xd808, 0xdf45,
+                                            0x5a, 0xd900, 0xdc00, 0x7a, 0 };
    len32 = LENGTHOF(surr32);
    for(i = 0; i < 6; ++i) {
        err = U_ZERO_ERROR;
@ -415,6 +456,40 @@ static void Test_strFromUTF32_surrogates() {
                u_errorName(err));
        return;
    }
+
+    /* with substitution character */
+    numSubstitutions = -1;
+    err = U_ZERO_ERROR;
+    u_strFromUTF32WithSub(uTarget, 0, &uDestLen, surr32, len32-1, 0xfffd, &numSubstitutions, &err);
+    if(err != U_BUFFER_OVERFLOW_ERROR || uDestLen != 10 || numSubstitutions != 4) {
+        log_err("u_strFromUTF32WithSub(preflight surr32) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strFromUTF32WithSub(uTarget, LENGTHOF(uTarget), &uDestLen, surr32, len32-1, 0xfffd, &numSubstitutions, &err);
+    if(err != U_ZERO_ERROR || uDestLen != 10 || numSubstitutions != 4 || u_memcmp(uTarget, expected_FFFD, 11)) {
+        log_err("u_strFromUTF32WithSub(surr32) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strFromUTF32WithSub(NULL, 0, &uDestLen, surr32, -1, 0x12345, &numSubstitutions, &err);
+    if(err != U_BUFFER_OVERFLOW_ERROR || uDestLen != 14 || numSubstitutions != 4) {
+        log_err("u_strFromUTF32WithSub(preflight surr32/NUL) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
+                u_errorName(err));
+        return;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strFromUTF32WithSub(uTarget, LENGTHOF(uTarget), &uDestLen, surr32, -1, 0x12345, &numSubstitutions, &err);
+    if(err != U_ZERO_ERROR || uDestLen != 14 || numSubstitutions != 4 || u_memcmp(uTarget, expected_12345, 15)) {
+        log_err("u_strFromUTF32WithSub(surr32/NUL) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
+                u_errorName(err));
+        return;
+    }
 }

 static void Test_UChar_UTF8_API(void){