ICU-858 Implemented options recognition and output generation for SUB, SKIP and ESCAPE

callbacks

X-SVN-Rev: 3641
This commit is contained in:
Ram Viswanadha 2001-02-16 20:12:50 +00:00
parent a03a11d085
commit d53a33cd04
2 changed files with 283 additions and 63 deletions

View file

@ -24,11 +24,20 @@
#define VALUE_STRING_LENGTH 32
/*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
#define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
#define UNICODE_U_CODEPOINT 0x0055
#define UNICODE_X_CODEPOINT 0x0058
#define UNICODE_U_CODEPOINT 0x0055
#define UNICODE_X_CODEPOINT 0x0058
#define UNICODE_RS_CODEPOINT 0x005C
#define UNICODE_U_LOW_CODEPOINT 0x0075
#define UNICODE_X_LOW_CODEPOINT 0x0078
#define UNICODE_AMP_CODEPOINT 0x0026
#define UNICODE_HASH_CODEPOINT 0x0023
#define UCNV_PRV_ESCAPE_ICU NULL
#define UCNV_PRV_ESCAPE_C 'C'
#define UCNV_PRV_ESCAPE_XML_DEC 'D'
#define UCNV_PRV_ESCAPE_XML_HEX 'X'
#define UCNV_PRV_ESCAPE_JAVA 'J'
/* Maps a digit value to its character code: 0..9 -> 0x0030..0x0039 ('0'..'9'),
 * 10 and above -> 0x0041 onward ('A'...). The argument and the whole expansion
 * are parenthesized so the macro can be used inside larger expressions.
 * Note: the argument is evaluated more than once; do not pass expressions
 * with side effects.
 */
#define ToOffset(a) ((a) <= 9 ? (0x0030 + (a)) : (0x0030 + (a) + 7))
UBool
CONVERSION_U_SUCCESS (UErrorCode err)
@ -40,7 +49,8 @@ UBool
/* Takes a uint32_t and fills in a UChar* string with that number written in
* base "radix" and padded with "pad" zeroes
*/
static void itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
#define MAX_DIGITS 10
static int32_t itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
{
int32_t length = 0;
int32_t num = 0;
@ -48,28 +58,30 @@ static void itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
int32_t j;
UChar temp;
while (i >= radix)
{
num = i / radix;
digit = (int8_t) (i - num * radix);
buffer[length++] = (UChar) (ToOffset (digit));
i = num;
}
do{
digit = (int)(i % radix);
buffer[length++]=(UChar)(digit<=9?(0x0030+digit):(0x0030+digit+7));
}while(i=i/radix);
buffer[length] = (UChar) (ToOffset (i));
while (length < pad)
buffer[length++] = (UChar) 0x0030;/*zero padding */
while (length < pad) buffer[++length] = (UChar) 0x0030; /*zero padding */
buffer[length--] = (UChar) 0x0000;
if(length<MAX_DIGITS){
buffer[length--] = (UChar) 0x0000;
}
num= (pad>=length) ? pad :length;
/*Reverses the string */
for (j = 0; j < (pad / 2); j++)
for (j = 0; j < (num / 2); j++)
{
temp = buffer[length - j];
buffer[length - j] = buffer[j];
buffer[j] = temp;
}
return;
/* truncates the padding */
return length+1;
}
/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
@ -109,10 +121,30 @@ void UCNV_FROM_U_CALLBACK_SKIP (
UConverterCallbackReason reason,
UErrorCode * err)
{
if (reason <= UCNV_IRREGULAR)
{
*err = U_ZERO_ERROR;
}
if(context==NULL)
{
if (reason <= UCNV_IRREGULAR)
{
*err = U_ZERO_ERROR;
return;
}
}
else if(*(char*)context=='i')
{
if(reason != UCNV_UNASSIGNED)
{
/* the caller must have set
* the error code accordingly
*/
return;
}
else
{
*err = U_ZERO_ERROR;
return;
}
}
}
void UCNV_FROM_U_CALLBACK_SUBSTITUTE (
@ -124,14 +156,33 @@ void UCNV_FROM_U_CALLBACK_SUBSTITUTE (
UConverterCallbackReason reason,
UErrorCode * err)
{
if (reason > UCNV_IRREGULAR)
{
return;
}
*err = U_ZERO_ERROR;
ucnv_cbFromUWriteSub(fromArgs, 0, err);
if(context == NULL)
{
if (reason > UCNV_IRREGULAR)
{
return;
}
*err = U_ZERO_ERROR;
ucnv_cbFromUWriteSub(fromArgs, 0, err);
return;
}
else if(*((char*)context)=='i')
{
if(reason != UCNV_UNASSIGNED)
{
/* the caller must have set
* the error code accordingly
*/
return;
}
else
{
*err = U_ZERO_ERROR;
ucnv_cbFromUWriteSub(fromArgs, 0, err);
return;
}
}
}
/*uses itou to get a unicode escape sequence of the offensive sequence,
@ -160,38 +211,102 @@ void UCNV_FROM_U_CALLBACK_ESCAPE (
UConverterFromUCallback ignoredCallback = NULL;
void *ignoredContext;
if (reason > UCNV_IRREGULAR)
{
return;
}
ucnv_setFromUCallBack (fromArgs->converter,
(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
NULL, /* To Do for HSYS: context is null? */
&original,
&originalContext,
&err2);
(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
NULL, /* To Do for HSYS: context is null? */
&original,
&originalContext,
&err2);
if (U_FAILURE (err2))
{
*err = err2;
return;
}
if(context==NULL)
{
while (i < length)
{
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
valueStringLength += 4;
}
}
/*
* ### TODO:
* This should actually really work with the codePoint, not with the codeUnits;
* how do we represent a code point > 0xffff? It should be one single escape, not
* two for a surrogate pair!
*/
while (i < length)
else
{
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
valueStringLength += 4;
}
switch(*((char*)context))
{
case UCNV_PRV_ESCAPE_JAVA:
while (i < length)
{
valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
valueStringLength += 4;
}
break;
case UCNV_PRV_ESCAPE_C:
valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
if(length==2){
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
valueStringLength += itou (valueString + valueStringLength, temp, 16, 8);
}
else{
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
}
break;
case UCNV_PRV_ESCAPE_XML_DEC:
valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
if(length==2){
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
valueStringLength += itou (valueString + valueStringLength, temp, 10, 0);
}
else{
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 10, 4);
}
break;
case UCNV_PRV_ESCAPE_XML_HEX:
valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
if(length==2){
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
valueStringLength += itou (valueString + valueStringLength, temp, 16, 0);
}
else{
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
}
break;
default:
while (i < length)
{
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
valueStringLength += 4;
}
}
}
myValueSource = valueString;
/* reset the error */
@ -224,9 +339,29 @@ void UCNV_TO_U_CALLBACK_SKIP (
UConverterCallbackReason reason,
UErrorCode * err)
{
if (reason <= UCNV_IRREGULAR)
if(context==NULL)
{
*err = U_ZERO_ERROR;
if (reason <= UCNV_IRREGULAR)
{
*err = U_ZERO_ERROR;
return;
}
}
else if(*((char*)context)=='i')
{
if(reason != UCNV_UNASSIGNED)
{
/* the caller must have set
* the error code accordingly
*/
return;
}
else
{
*err = U_ZERO_ERROR;
return;
}
}
}
@ -238,15 +373,34 @@ void UCNV_TO_U_CALLBACK_SUBSTITUTE (
UConverterCallbackReason reason,
UErrorCode * err)
{
if (reason > UCNV_IRREGULAR)
if(context == NULL)
{
if (reason > UCNV_IRREGULAR)
{
return;
}
*err = U_ZERO_ERROR;
ucnv_cbToUWriteSub(toArgs,0,err);
return;
}
*err = U_ZERO_ERROR;
ucnv_cbToUWriteSub(toArgs,0,err);
else if(*((char*)context)=='i')
{
if(reason != UCNV_UNASSIGNED)
{
/* the caller must have set
* the error code accordingly
*/
return;
}
else
{
*err = U_ZERO_ERROR;
ucnv_cbToUWriteSub(toArgs,0,err);
return;
}
}
return;
}
/*uses itou to get a unicode escape sequence of the offensive sequence,

View file

@ -60,6 +60,21 @@
#include "unicode/utypes.h"
/**
* FROM_U, TO_U options for sub and skip callbacks
*/
#define UCNV_SUB_STOP_ON_ILLEGAL "i"
#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
/**
* FROM_U_CALLBACK_ESCAPE options
*/
#define UCNV_ESCAPE_ICU NULL
#define UCNV_ESCAPE_JAVA "J"
#define UCNV_ESCAPE_C "C"
#define UCNV_ESCAPE_XML_DEC "D"
#define UCNV_ESCAPE_XML_HEX "X"
/**
* The process condition code to be used with the callbacks.
*/
@ -132,6 +147,7 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
*
* @stable
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
@ -144,8 +160,14 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback skips any illegal sequence,
* This From Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter
* simply ignoring those characters.
* @param context: the function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
*
* @stable
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
@ -159,9 +181,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or
* UNASSIGNED_SEQUENCE depending on context parameter, with the
* current substitution string for the converter. This is the default
* callback.
* @param context: the function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @see ucnv_setSubstChars
* @stable
*/
@ -179,11 +206,39 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
* hexadecimal representation of the illegal codepoints
* (in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). In the Event the
* converter doesn't support the characters {u,%}[A-F][0-9], it will
* substitute the illegal sequence with the substitution characters.
* Note that percent (%) was chosen because backslash (\) does not exist
* on many converters.
* @param context: the function currently recognizes the callback options:
*
* UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format %UXXXX, e.g. "%UFFFE%U00AC%UC8FE".
* In the event the converter doesn't support the characters {U,%}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a code point encoded as a surrogate pair is represented as two
* escapes, one per code unit, e.g. %UD84D%UDC56
* UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE".
* In the event the converter doesn't support the characters {u,\}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a code point encoded as a surrogate pair is represented as two
* escapes, one per code unit, e.g. \uD84D\uDC56
* UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE".
* In the event the converter doesn't support the characters {u,U,\}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a code point encoded as a surrogate pair is represented as a
* single 32-bit escape, e.g. \U00023456
* UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
* representation in the format &#DDDDDDDD, e.g. "&#65534&#172&#51454".
* In the event the converter doesn't support the characters {&,#}[0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a code point encoded as a surrogate pair is represented as a
* single escape, e.g. &#144470, and zero padding is ignored.
* UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format &#xXXXX, e.g. "&#xFFFE&#x00AC&#xC8FE".
* In the event the converter doesn't support the characters {&,#,x}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a code point encoded as a surrogate pair is represented as a
* single escape, e.g. &#x23456
* @stable
*/
@ -199,8 +254,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback skips any illegal sequence,
* This To Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter
* simply ignoring those characters.
* @param context: the function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
*
* @stable
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
@ -213,8 +274,13 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE,or
* UNASSIGNED_SEQUENCE depending on context parameter, with the
* Unicode substitution character, U+FFFD.
* @param context: the function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @stable
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (