ICU-5188 add ucnv_setSubstString()

X-SVN-Rev: 19789
This commit is contained in:
Markus Scherer 2006-07-05 23:08:50 +00:00
parent 820ca02a9f
commit 8e5f03bcd1
13 changed files with 394 additions and 34 deletions

View file

@ -249,6 +249,19 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
uprv_memcpy(localConverter, cnv, sizeof(UConverter));
localConverter->isCopyLocal = localConverter->isExtraLocal = FALSE;
/* copy the substitution string */
if (cnv->subChars == (uint8_t *)cnv->subUChars) {
localConverter->subChars = (uint8_t *)localConverter->subUChars;
} else {
localConverter->subChars = (uint8_t *)uprv_malloc(UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR);
if (localConverter->subChars == NULL) {
uprv_free(allocatedConverter);
UTRACE_EXIT_STATUS(*status);
return NULL;
}
uprv_memcpy(localConverter->subChars, cnv->subChars, UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR);
}
/* now either call the safeclone fcn or not */
if (cnv->sharedData->impl->safeClone != NULL) {
/* call the custom safeClone function */
@ -256,6 +269,9 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
}
if(localConverter==NULL || U_FAILURE(*status)) {
if (allocatedConverter != NULL && allocatedConverter->subChars != (uint8_t *)allocatedConverter->subUChars) {
uprv_free(allocatedConverter->subChars);
}
uprv_free(allocatedConverter);
UTRACE_EXIT_STATUS(*status);
return NULL;
@ -348,6 +364,10 @@ ucnv_close (UConverter * converter)
converter->sharedData->impl->close(converter);
}
if (converter->subChars != (uint8_t *)converter->subUChars) {
uprv_free(converter->subChars);
}
/*
Checking whether it's an algorithmic converter is okay
in multithreaded applications because the value never changes.
@ -395,15 +415,19 @@ ucnv_getSubstChars (const UConverter * converter,
if (U_FAILURE (*err))
return;
if (converter->subCharLen <= 0) {
/* Unicode string or empty string from ucnv_setSubstString(). */
*len = 0;
return;
}
if (*len < converter->subCharLen) /*not enough space in subChars */
{
*err = U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
uprv_memcpy (mySubChar, converter->subChar, converter->subCharLen); /*fills in the subchars */
*len = converter->subCharLen; /*store # of bytes copied to buffer */
uprv_memcpy (mySubChar, converter->subChar, converter->subCharLen); /*fills in the subchars */
uprv_memcpy (mySubChar, converter->subChars, converter->subCharLen); /*fills in the subchars */
*len = converter->subCharLen; /*store # of bytes copied to buffer */
}
@ -424,7 +448,7 @@ ucnv_setSubstChars (UConverter * converter,
return;
}
uprv_memcpy (converter->subChar, mySubChar, len); /*copies the subchars */
uprv_memcpy (converter->subChars, mySubChar, len); /*copies the subchars */
converter->subCharLen = len; /*sets the new len */
/*
@ -437,6 +461,91 @@ ucnv_setSubstChars (UConverter * converter,
return;
}
/*
 * Set the substitution string that is written for unconvertible input
 * when converting from Unicode.
 *
 * The string is first converted with a temporary clone of cnv that uses the
 * STOP callback, so that an unconvertible string or one whose charset form
 * does not fit into UCNV_ERROR_BUFFER_LENGTH bytes is reported through *err
 * before cnv itself is modified.
 *
 * Storage:
 * - converter treated as stateless: the converted charset bytes are stored
 *   and subCharLen is set to their (positive) byte count;
 * - converter treated as stateful: the UChars of s are stored for conversion
 *   at substitution time and subCharLen is set to -(number of UChars);
 * - empty result: subCharLen is set to 0.
 *
 * @param cnv    The converter to modify.
 * @param s      The Unicode substitution string.
 * @param length The number of UChars in s, or negative if s is NUL-terminated.
 * @param err    In/out ICU error code. Set to U_BUFFER_OVERFLOW_ERROR if the
 *               string is too long, U_MEMORY_ALLOCATION_ERROR if the external
 *               buffer cannot be allocated, or to whatever the clone/convert
 *               calls report.
 */
U_DRAFT void U_EXPORT2
ucnv_setSubstString(UConverter *cnv,
                    const UChar *s,
                    int32_t length,
                    UErrorCode *err) {
    UAlignedMemory cloneBuffer[U_CNV_SAFECLONE_BUFFERSIZE / sizeof(UAlignedMemory) + 1];
    char chars[UCNV_ERROR_BUFFER_LENGTH];

    UConverter *clone;
    uint8_t *subChars;
    int32_t cloneSize, length8;

    /* Let the following functions check all arguments. */
    cloneSize = sizeof(cloneBuffer);
    clone = ucnv_safeClone(cnv, cloneBuffer, &cloneSize, err);
    ucnv_setFromUCallBack(clone, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, err);
    length8 = ucnv_fromUChars(clone, chars, (int32_t)sizeof(chars), s, length, err);
    ucnv_close(clone);
    if (U_FAILURE(*err)) {
        return;
    }

    if (cnv->sharedData->impl->writeSub == NULL ||
        (cnv->sharedData->staticData->conversionType == UCNV_MBCS &&
         ucnv_MBCSGetType(cnv) != UCNV_EBCDIC_STATEFUL)
    ) {
        /* The converter is not stateful. Store the charset bytes as a fixed string. */
        subChars = (uint8_t *)chars;
    } else {
        /*
         * The converter has a non-default writeSub() function, indicating
         * that it is stateful.
         * Store the Unicode string for on-the-fly conversion for correct
         * state handling.
         */

        /*
         * Resolve a NUL-terminated string to its real length first, so that
         * the bounds check below also covers that case and the copy further
         * down can never exceed the substitution buffer.
         */
        if (length < 0) {
            length = u_strlen(s);
        }
        if (length > UCNV_ERROR_BUFFER_LENGTH) {
            /*
             * Should not occur. The converter should output at least one byte
             * per UChar, which means that ucnv_fromUChars() should catch all
             * overflows.
             */
            *err = U_BUFFER_OVERFLOW_ERROR;
            return;
        }
        subChars = (uint8_t *)s;
        length8 = length * U_SIZEOF_UCHAR;
    }

    /*
     * For storing the substitution string, select either the small buffer inside
     * UConverter or allocate a subChars buffer.
     */
    if (length8 > UCNV_MAX_SUBCHAR_LEN) {
        /* Use a separate buffer for the string. Outside UConverter to not make it too large. */
        if (cnv->subChars == (uint8_t *)cnv->subUChars) {
            /* Allocate a new buffer for the string. */
            cnv->subChars = (uint8_t *)uprv_malloc(UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR);
            if (cnv->subChars == NULL) {
                /* Fall back to the internal buffer so that cnv stays consistent. */
                cnv->subChars = (uint8_t *)cnv->subUChars;
                *err = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            uprv_memset(cnv->subChars, 0, UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR);
        }
    }

    /* Copy the substitution string into the UConverter or its subChars buffer. */
    if (length8 == 0) {
        cnv->subCharLen = 0;
    } else {
        uprv_memcpy(cnv->subChars, subChars, length8);
        if (subChars == (uint8_t *)chars) {
            /* charset bytes: positive byte count */
            cnv->subCharLen = (int8_t)length8;
        } else /* subChars == s */ {
            /* Unicode string: negative UChar count */
            cnv->subCharLen = (int8_t)-length;
        }
    }

    /* See comment in ucnv_setSubstChars(). */
    cnv->subChar1 = 0;
}
/*resets the internal states of a converter
*goal : have the same behaviour as a freshly created converter
*/

View file

@ -470,7 +470,7 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
}
uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
uprv_memcpy(cnv->subChar, myConverterData->currentConverter->subChar, 4);
uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
cnv->subCharLen = myConverterData->currentConverter->subCharLen;
}else{
myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
@ -2869,7 +2869,7 @@ _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
char buffer[8];
int32_t length;
subchar=(char *)cnv->subChar;
subchar=(char *)cnv->subChars;
length=cnv->subCharLen; /* assume length==1 for most variants */
p = buffer;
@ -2924,14 +2924,24 @@ _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
}
break;
} else {
/* let the subconverter write the subchar */
args->converter = myConverterData->currentConverter;
uprv_memcpy(myConverterData->currentConverter->subChar, subchar, 4);
/* save the subconverter's substitution string */
uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
/* set our substitution string into the subconverter */
myConverterData->currentConverter->subChars = (uint8_t *)subchar;
myConverterData->currentConverter->subCharLen = (int8_t)length;
/* let the subconverter write the subchar, set/retrieve fromUChar32 state */
args->converter = myConverterData->currentConverter;
myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
ucnv_cbFromUWriteSub(args, 0, err);
cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
args->converter = cnv;
/* restore the subconverter's substitution string */
myConverterData->currentConverter->subChars = currentSubChars;
myConverterData->currentConverter->subCharLen = currentSubCharLen;
if(*err == U_BUFFER_OVERFLOW_ERROR) {
if(myConverterData->currentConverter->charErrorBufferLength > 0) {
@ -2943,7 +2953,6 @@ _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
myConverterData->currentConverter->charErrorBufferLength = 0;
}
args->converter = cnv;
return;
}
default:

View file

@ -912,7 +912,8 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
myUConverter->maxBytesPerUChar = mySharedConverterData->staticData->maxBytesPerChar;
myUConverter->subChar1 = mySharedConverterData->staticData->subChar1;
myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
uprv_memcpy(myUConverter->subChar, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
if(mySharedConverterData->impl->open != NULL) {
mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2005, International Business Machines
* Copyright (C) 1999-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -157,6 +157,17 @@ struct UConverter {
const void *fromUContext;
const void *toUContext;
/*
* Pointer to charset bytes for substitution string if subCharLen>0,
* or pointer to Unicode string (UChar *) if subCharLen<0.
* subCharLen==0 is equivalent to using a skip callback.
* If the pointer is !=subUChars then it is allocated with
* UCNV_ERROR_BUFFER_LENGTH * U_SIZEOF_UCHAR bytes.
* The subUChars field is declared as UChar[] not uint8_t[] to
* guarantee alignment for UChars.
*/
uint8_t *subChars;
UConverterSharedData *sharedData; /* Pointer to the shared immutable part of the converter object */
uint32_t options; /* options flags from UConverterOpen, may contain additional bits */
@ -200,9 +211,9 @@ struct UConverter {
uint8_t subChar1; /* single-byte substitution character if different from subChar */
UBool useSubChar1;
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */
char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */
uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */
UChar subUChars[UCNV_MAX_SUBCHAR_LEN/U_SIZEOF_UCHAR]; /* see subChars documentation */
UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */
UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */

View file

@ -177,10 +177,31 @@ ucnv_cbFromUWriteSub (UConverterFromUnicodeArgs *args,
UErrorCode * err)
{
UConverter *converter;
int32_t length;
if(U_FAILURE(*err)) {
return;
}
converter = args->converter;
length = converter->subCharLen;
if(length == 0) {
return;
}
if(length < 0) {
/*
* Write/convert the substitution string. Its real length is -length.
* Unlike the escape callback, we need not change the converter's
* callback function because ucnv_setSubstString() verified that
* the string can be converted, so we will not get a conversion error
* and will not recurse.
* At worst we should get a U_BUFFER_OVERFLOW_ERROR.
*/
const UChar *source = (const UChar *)converter->subChars;
ucnv_cbFromUWriteUChars(args, &source, source - length, offsetIndex, err);
return;
}
if(converter->sharedData->impl->writeSub!=NULL) {
converter->sharedData->impl->writeSub(args, offsetIndex, err);
@ -196,7 +217,7 @@ ucnv_cbFromUWriteSub (UConverterFromUnicodeArgs *args,
}
else {
ucnv_cbFromUWriteBytes(args,
(const char *)converter->subChar, converter->subCharLen,
(const char *)converter->subChars, length,
offsetIndex, err);
}
}

View file

@ -460,7 +460,7 @@ _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *e
*p++= UCNV_CLOSE_BRACE;
convData->isTargetUCharDBCS=FALSE;
}
*p++= cnv->subChar[0];
*p++= (char)cnv->subChars[0];
ucnv_cbFromUWriteBytes(args,
buffer, (int32_t)(p - buffer),

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2005, International Business Machines
* Copyright (C) 2000-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvisci.c
@ -1319,7 +1319,7 @@ _ISCII_SafeClone(const UConverter *cnv,
}
localClone = (struct cloneISCIIStruct *)stackBuffer;
uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
/* ucnv.c/ucnv_safeClone() copied the main UConverter already */
uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII));
localClone->cnv.extraInfo = &localClone->mydata;

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2000-2005, International Business Machines
* Copyright (C) 2000-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -3877,7 +3877,7 @@ ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
length=1;
} else {
/* select subChar in all other cases */
subchar=(char *)cnv->subChar;
subchar=(char *)cnv->subChars;
length=cnv->subCharLen;
}

View file

@ -523,6 +523,8 @@ ucnv_close(UConverter * converter);
/**
* Fills in the output parameter, subChars, with the substitution characters
* as multiple bytes.
* If ucnv_setSubstString() set a Unicode string because the converter is
* stateful, then subChars will be an empty string.
*
* @param converter the Unicode converter
* @param subChars the substitution characters
@ -531,6 +533,7 @@ ucnv_close(UConverter * converter);
* @param err the outgoing error status code.
* If the substitution character array is too small, an
* <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned.
* @see ucnv_setSubstString
* @see ucnv_setSubstChars
* @stable ICU 2.0
*/
@ -543,12 +546,19 @@ ucnv_getSubstChars(const UConverter *converter,
/**
* Sets the substitution chars when converting from unicode to a codepage. The
* substitution is specified as a string of 1-4 bytes, and may contain
* <TT>NULL</TT> byte.
* <TT>NULL</TT> bytes.
* The subChars must represent a single character. The caller needs to know the
* byte sequence of a valid character in the converter's charset.
* For some converters, for example some ISO 2022 variants, only single-byte
* substitution characters may be supported.
* The newer ucnv_setSubstString() function relaxes these limitations.
*
* @param converter the Unicode converter
* @param subChars the substitution character byte sequence we want set
* @param len the number of bytes in subChars
* @param err the error status code. <TT>U_INDEX_OUTOFBOUNDS_ERROR </TT> if
* len is bigger than the maximum number of bytes allowed in subchars
* @see ucnv_setSubstString
* @see ucnv_getSubstChars
* @stable ICU 2.0
*/
@ -558,6 +568,39 @@ ucnv_setSubstChars(UConverter *converter,
int8_t len,
UErrorCode *err);
/**
* Set a substitution string for converting from Unicode to a charset.
* The caller need not know the charset byte sequence for each charset.
*
* Unlike ucnv_setSubstChars() which is designed to set a charset byte sequence
* for a single character, this function takes a Unicode string with
* zero, one or more characters, and immediately verifies that the string can be
* converted to the charset.
* If not, or if the result is too long (more than 32 bytes as of ICU 3.6),
* then the function returns with an error accordingly.
*
* Setting an empty string (zero characters) is equivalent to using a
* skip callback: nothing is written at the point of substitution.
*
* Also unlike ucnv_setSubstChars(), this function works for stateful charsets
* by converting on the fly at the point of substitution rather than setting
* a fixed byte sequence.
* In that case the Unicode string itself is stored, and a subsequent
* ucnv_getSubstChars() returns an empty string (length 0).
*
* @param cnv The UConverter object.
* @param s The Unicode string.
* @param length The number of UChars in s, or -1 for a NUL-terminated string.
* @param err Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*
* @see ucnv_setSubstChars
* @see ucnv_getSubstChars
* @draft ICU 3.6
*/
U_DRAFT void U_EXPORT2
ucnv_setSubstString(UConverter *cnv,
const UChar *s,
int32_t length,
UErrorCode *err);
/**
* Fills in the output parameter, errBytes, with the error characters from the
* last failing conversion.
@ -1820,7 +1863,7 @@ ucnv_usesFallback(const UConverter *cnv);
* UErrorCode err = U_ZERO_ERROR;
* char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
* int32_t signatureLength = 0;
* char *encoding = ucnv_detectUnicodeSignatures(input,sizeof(input),&signatureLength,&err);
* char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
* UConverter *conv = NULL;
* UChar output[100];
* UChar *target = output, *out;

View file

@ -110,6 +110,7 @@ static void TestToUCountPending(void);
static void TestFromUCountPending(void);
static void TestDefaultName(void);
static void TestCompareNames(void);
static void TestSubstString(void);
void addTestConvert(TestNode** root);
@ -134,6 +135,7 @@ void addTestConvert(TestNode** root)
addTest(root, &TestFromUCountPending, "tsconv/ccapitst/TestFromUCountPending");
addTest(root, &TestDefaultName, "tsconv/ccapitst/TestDefaultName");
addTest(root, &TestCompareNames, "tsconv/ccapitst/TestCompareNames");
addTest(root, &TestSubstString, "tsconv/ccapitst/TestSubstString");
}
static void ListNames(void) {
@ -3105,3 +3107,93 @@ TestCompareNames() {
compareNames(lessMac);
compareNames(lessUTF080);
}
/*
 * API-level checks around substitution output:
 *  1. UTF-16 and UTF-32 converters emit the BOM before the substitution
 *     character when the only input is an unpaired surrogate.
 *  2. ucnv_setSubstString() followed by ucnv_getSubstChars() yields the
 *     converted bytes for a stateless converter (ISO-8859-1) and an empty
 *     result for a stateful one (HZ).
 */
static void
TestSubstString() {
    static const UChar loneSurrogate[1]={ 0xd900 };
    static const UChar subText[5]={ 0x61, 0x62, 0x63, 0x64, 0x65 };
    static const char expectedBytes[5]={ 0x61, 0x62, 0x63, 0x64, 0x65 };

    char out[16];
    UConverter *conv;
    UErrorCode status;
    int32_t written;
    int8_t subLen;

    /* UTF-16: the BOM must precede the sub character. */
    status=U_ZERO_ERROR;
    conv=ucnv_open("UTF-16", &status);
    if(U_FAILURE(status)) {
        log_err("ucnv_open(UTF-16) failed - %s\n", u_errorName(status));
        return;
    }
    written=ucnv_fromUChars(conv, out, (int32_t)sizeof(out), loneSurrogate, 1, &status);
    ucnv_close(conv);
    if(!( !U_FAILURE(status) &&
          written==4 &&
          NULL != ucnv_detectUnicodeSignature(out, written, NULL, &status) )
    ) {
        log_err("ucnv_fromUChars(UTF-16, U+D900) did not write a BOM\n");
    }

    /* UTF-32: same expectation, with 4-byte code units. */
    status=U_ZERO_ERROR;
    conv=ucnv_open("UTF-32", &status);
    if(U_FAILURE(status)) {
        log_err("ucnv_open(UTF-32) failed - %s\n", u_errorName(status));
        return;
    }
    written=ucnv_fromUChars(conv, out, (int32_t)sizeof(out), loneSurrogate, 1, &status);
    ucnv_close(conv);
    if(!( !U_FAILURE(status) &&
          written==8 &&
          NULL != ucnv_detectUnicodeSignature(out, written, NULL, &status) )
    ) {
        log_err("ucnv_fromUChars(UTF-32, U+D900) did not write a BOM\n");
    }

    /* Stateless converter: set a Unicode string, read back charset bytes. */
    status=U_ZERO_ERROR;
    conv=ucnv_open("ISO-8859-1", &status);
    if(U_FAILURE(status)) {
        log_err("ucnv_open(ISO-8859-1) failed - %s\n", u_errorName(status));
        return;
    }
    ucnv_setSubstString(conv, subText, LENGTHOF(subText), &status);
    if(!U_FAILURE(status)) {
        subLen = sizeof(out);
        ucnv_getSubstChars(conv, out, &subLen, &status);
        /* The string is expected to come back converted to charset bytes. */
        if(U_FAILURE(status) || subLen!=sizeof(expectedBytes) || 0!=uprv_memcmp(out, expectedBytes, subLen)) {
            log_err("ucnv_getSubstChars(ucnv_setSubstString(ISO-8859-1, sub[5])) failed - %s\n", u_errorName(status));
        }
    } else {
        log_err("ucnv_setSubstString(ISO-8859-1, sub[5]) failed - %s\n", u_errorName(status));
    }
    ucnv_close(conv);

    /* Stateful converter: the Unicode string is kept, so the byte getter returns nothing. */
    status=U_ZERO_ERROR;
    conv=ucnv_open("HZ", &status);
    if(U_FAILURE(status)) {
        log_err("ucnv_open(HZ) failed - %s\n", u_errorName(status));
        return;
    }
    ucnv_setSubstString(conv, subText, LENGTHOF(subText), &status);
    if(!U_FAILURE(status)) {
        subLen = sizeof(out);
        ucnv_getSubstChars(conv, out, &subLen, &status);
        /* An empty char * string is expected here. */
        if(U_FAILURE(status) || subLen!=0) {
            log_err("ucnv_getSubstChars(ucnv_setSubstString(HZ, sub[5])) failed - %s\n", u_errorName(status));
        }
    } else {
        log_err("ucnv_setSubstString(HZ, sub[5]) failed - %s\n", u_errorName(status));
    }
    ucnv_close(conv);

    /*
     * More thorough coverage of ucnv_setSubstString() lives in
     * intltest convert. Illegal-argument edge cases are not exercised here
     * because the implementation passes all of its parameters on to other
     * functions that take a UErrorCode.
     */
}

View file

@ -183,7 +183,7 @@ ConversionTest::TestFromUnicode() {
char charset[100], cbopt[4];
const char *option;
UnicodeString s, unicode, invalidUChars;
int32_t offsetsLength;
int32_t offsetsLength, index;
UConverterFromUCallback callback;
TestDataModule *dataModule;
@ -242,15 +242,17 @@ ConversionTest::TestFromUnicode() {
}
s=testCase->getString("callback", errorCode);
cc.setSub=0; // default: no subchar
// read NUL-separated subchar first, if any
length=u_strlen(p=s.getTerminatedBuffer());
if(++length<s.length()) {
if((index=s.indexOf((UChar)0))>0) {
// read NUL-separated subchar first, if any
// copy the subchar from Latin-1 characters
// start after the NUL
p=s.getTerminatedBuffer();
length=index+1;
p+=length;
length=s.length()-length;
if(length>=(int32_t)sizeof(cc.subchar)) {
if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
} else {
int32_t j;
@ -260,13 +262,26 @@ ConversionTest::TestFromUnicode() {
}
// NUL-terminate the subchar
cc.subchar[j]=0;
cc.setSub=1;
}
// remove the NUL and subchar from s
s.truncate(u_strlen(s.getBuffer()));
} else {
// no subchar
cc.subchar[0]=0;
s.truncate(index);
} else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
// read a substitution string, separated by an equal sign
p=s.getBuffer()+index+1;
length=s.length()-(index+1);
if(length<=0 || length>=LENGTHOF(cc.subString)) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
} else {
u_memcpy(cc.subString, p, length);
// NUL-terminate the subString
cc.subString[length]=0;
cc.setSub=-1;
}
// remove the equal sign and subString from s
s.truncate(index);
}
s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
@ -1052,10 +1067,19 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
// set the subchar
int32_t length;
if((length=(int32_t)strlen(cc.subchar))!=0) {
if(cc.setSub>0) {
length=(int32_t)strlen(cc.subchar);
ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
if(U_FAILURE(errorCode)) {
errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubChars() failed - %s",
errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
ucnv_close(cnv);
return FALSE;
}
} else if(cc.setSub<0) {
ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
if(U_FAILURE(errorCode)) {
errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
ucnv_close(cnv);
return FALSE;

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Copyright (C) 2003-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -29,7 +29,9 @@
struct ConversionCase {
int32_t caseNr;
const char *charset, *cbopt, *name;
UChar subString[16];
char subchar[8];
int8_t setSub;
const uint8_t *bytes;
int32_t bytesLength;

View file

@ -29,11 +29,16 @@ conversion:table(nofallback) {
"Callback option - a letter is passed in directly as const char * see ucnv_err.h\n"
"Empty string: Sub callback with NULL option\n"
"In order to specify a charset substitution character,\n"
"In order to specify a charset substitution character (for ucnv_setSubstChars()),\n"
"add a NUL (U+0000) to the callback string followed by the subchar bytes as Latin-1\n"
"characters. For example, for a Sub callback with no option and a subchar of FC FC,\n"
"use the string \"?\x00\xFC\xFC\"\n"
"In order to specify a substitution string (for ucnv_setSubstString()),\n"
"add an '=' to the callback string followed by the substitution string.\n"
"For example, for a Sub callback with no option and a substitution string\n"
"of \"ab\", use the string \"?=ab\"\n"
"fallbacks: per-direction boolean, currently only for fromUnicode; see Jitterbug 2401\n"
"errorCode: (empty)==zero | invalid | illegal | truncated | illesc | unsuppesc\n"
@ -445,6 +450,49 @@ conversion:table(nofallback) {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
// Test substitution strings.
{
"windows-1252", // stateless MBCS with WriteSub(), should internally set char *
"a\ufdd0c",
:bin{ 61402421402463 },
:intvector{ 0,1,1,1,1,1,2 },
:int{1}, :int{0}, "", "?=@$!@$", ""
}
{
"ibm-930", // stateful MBCS
"a\ufdd0\u4e00\ufdd0e",
:bin{ 620e4bce0f400e45414bce0f4066 },
:intvector{ 0,1,1,1,1,1,2,2,2,3,3,3,3,4 },
:int{1}, :int{0}, "", "?=\u4e01 ", ""
}
{
"iso-2022-jp",
"a\x1bc", // Unicode ESC must not occur as a character
:bin{ 6163 },
:intvector{ 0,2 },
:int{1}, :int{0}, "", "?=", "" // empty substitution string
}
{
"iso-2022-cn",
"a\x1bc", // Unicode ESC must not occur as a character
:bin{ 61202063 },
:intvector{ 0,1,1,2 },
:int{1}, :int{0}, "", "?= ", ""
}
{
"iso-2022-cn",
"a\x1bc", // Unicode ESC must not occur as a character
:bin{ 611b2429410e523b0f2063 },
:intvector{ 0,1,1,1,1,1,1,1,1,1,2 },
:int{1}, :int{0}, "", "?=\u4e00 ", ""
}
{
"us-ascii",
"a\x85c",
:bin{ 61402421402463 },
:intvector{ 0,1,1,1,1,1,2 },
:int{1}, :int{0}, "", "?=@$!@$", ""
}
// ISO 2022-CN: test a single-byte subchar, j5171
{
"iso-2022-cn",