mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
ICU-858 Implemented options recognition and output generation for SUB, SKIP and ESCAPE
callbacks X-SVN-Rev: 3641
This commit is contained in:
parent
a03a11d085
commit
d53a33cd04
2 changed files with 283 additions and 63 deletions
|
@ -24,11 +24,20 @@
|
|||
#define VALUE_STRING_LENGTH 32
|
||||
/*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
|
||||
#define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
|
||||
#define UNICODE_U_CODEPOINT 0x0055
|
||||
#define UNICODE_X_CODEPOINT 0x0058
|
||||
#define UNICODE_U_CODEPOINT 0x0055
|
||||
#define UNICODE_X_CODEPOINT 0x0058
|
||||
#define UNICODE_RS_CODEPOINT 0x005C
|
||||
#define UNICODE_U_LOW_CODEPOINT 0x0075
|
||||
#define UNICODE_X_LOW_CODEPOINT 0x0078
|
||||
#define UNICODE_AMP_CODEPOINT 0x0026
|
||||
#define UNICODE_HASH_CODEPOINT 0x0023
|
||||
|
||||
#define UCNV_PRV_ESCAPE_ICU NULL
|
||||
#define UCNV_PRV_ESCAPE_C 'C'
|
||||
#define UCNV_PRV_ESCAPE_XML_DEC 'D'
|
||||
#define UCNV_PRV_ESCAPE_XML_HEX 'X'
|
||||
#define UCNV_PRV_ESCAPE_JAVA 'J'
|
||||
|
||||
/* Maps a digit value to the Unicode code point of its uppercase digit character:
 * values 0..9 yield 0x0030..0x0039 ('0'..'9'); larger values yield 0x0041 ('A')
 * onward (the +7 skips the code points between '9' and 'A').
 * The argument and the whole expansion are parenthesized so the macro composes
 * safely inside larger expressions. Note that `a` is evaluated more than once,
 * so do not pass an expression with side effects. */
#define ToOffset(a) ((a)<=9?(0x0030+(a)):(0x0030+(a)+7))
|
||||
|
||||
UBool
|
||||
CONVERSION_U_SUCCESS (UErrorCode err)
|
||||
|
@ -40,7 +49,8 @@ UBool
|
|||
/*Takes a uint32_t and fills in a UChar* string with that number "radix"-based
|
||||
* and padded with "pad" zeroes
|
||||
*/
|
||||
static void itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
|
||||
#define MAX_DIGITS 10
|
||||
static int32_t itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
|
||||
{
|
||||
int32_t length = 0;
|
||||
int32_t num = 0;
|
||||
|
@ -48,28 +58,30 @@ static void itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
|
|||
int32_t j;
|
||||
UChar temp;
|
||||
|
||||
while (i >= radix)
|
||||
{
|
||||
num = i / radix;
|
||||
digit = (int8_t) (i - num * radix);
|
||||
buffer[length++] = (UChar) (ToOffset (digit));
|
||||
i = num;
|
||||
}
|
||||
do{
|
||||
digit = (int)(i % radix);
|
||||
buffer[length++]=(UChar)(digit<=9?(0x0030+digit):(0x0030+digit+7));
|
||||
}while(i=i/radix);
|
||||
|
||||
buffer[length] = (UChar) (ToOffset (i));
|
||||
while (length < pad)
|
||||
buffer[length++] = (UChar) 0x0030;/*zero padding */
|
||||
|
||||
while (length < pad) buffer[++length] = (UChar) 0x0030; /*zero padding */
|
||||
buffer[length--] = (UChar) 0x0000;
|
||||
if(length<MAX_DIGITS){
|
||||
buffer[length--] = (UChar) 0x0000;
|
||||
}
|
||||
num= (pad>=length) ? pad :length;
|
||||
|
||||
/*Reverses the string */
|
||||
for (j = 0; j < (pad / 2); j++)
|
||||
for (j = 0; j < (num / 2); j++)
|
||||
{
|
||||
temp = buffer[length - j];
|
||||
buffer[length - j] = buffer[j];
|
||||
buffer[j] = temp;
|
||||
}
|
||||
|
||||
return;
|
||||
/* truncates the padding */
|
||||
|
||||
return length+1;
|
||||
}
|
||||
|
||||
/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
|
||||
|
@ -109,10 +121,30 @@ void UCNV_FROM_U_CALLBACK_SKIP (
|
|||
UConverterCallbackReason reason,
|
||||
UErrorCode * err)
|
||||
{
|
||||
if (reason <= UCNV_IRREGULAR)
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
}
|
||||
if(context==NULL)
|
||||
{
|
||||
if (reason <= UCNV_IRREGULAR)
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
else if(*(char*)context=='i')
|
||||
{
|
||||
if(reason != UCNV_UNASSIGNED)
|
||||
{
|
||||
/* the caller must have set
|
||||
* the error code accordingly
|
||||
*/
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void UCNV_FROM_U_CALLBACK_SUBSTITUTE (
|
||||
|
@ -124,14 +156,33 @@ void UCNV_FROM_U_CALLBACK_SUBSTITUTE (
|
|||
UConverterCallbackReason reason,
|
||||
UErrorCode * err)
|
||||
{
|
||||
if (reason > UCNV_IRREGULAR)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
*err = U_ZERO_ERROR;
|
||||
|
||||
ucnv_cbFromUWriteSub(fromArgs, 0, err);
|
||||
if(context == NULL)
|
||||
{
|
||||
if (reason > UCNV_IRREGULAR)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
*err = U_ZERO_ERROR;
|
||||
ucnv_cbFromUWriteSub(fromArgs, 0, err);
|
||||
return;
|
||||
}
|
||||
else if(*((char*)context)=='i')
|
||||
{
|
||||
if(reason != UCNV_UNASSIGNED)
|
||||
{
|
||||
/* the caller must have set
|
||||
* the error code accordingly
|
||||
*/
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
ucnv_cbFromUWriteSub(fromArgs, 0, err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*uses itou to get a unicode escape sequence of the offensive sequence,
|
||||
|
@ -160,38 +211,102 @@ void UCNV_FROM_U_CALLBACK_ESCAPE (
|
|||
|
||||
UConverterFromUCallback ignoredCallback = NULL;
|
||||
void *ignoredContext;
|
||||
|
||||
|
||||
if (reason > UCNV_IRREGULAR)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
ucnv_setFromUCallBack (fromArgs->converter,
|
||||
(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
||||
NULL, /* To Do for HSYS: context is null? */
|
||||
&original,
|
||||
&originalContext,
|
||||
&err2);
|
||||
(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
||||
NULL, /* To Do for HSYS: context is null? */
|
||||
&original,
|
||||
&originalContext,
|
||||
&err2);
|
||||
|
||||
if (U_FAILURE (err2))
|
||||
{
|
||||
*err = err2;
|
||||
return;
|
||||
}
|
||||
if(context==NULL)
|
||||
{
|
||||
while (i < length)
|
||||
{
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
|
||||
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
|
||||
valueStringLength += 4;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ### TODO:
|
||||
* This should actually really work with the codePoint, not with the codeUnits;
|
||||
* how do we represent a code point > 0xffff? It should be one single escape, not
|
||||
* two for a surrogate pair!
|
||||
*/
|
||||
while (i < length)
|
||||
else
|
||||
{
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
|
||||
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
|
||||
valueStringLength += 4;
|
||||
}
|
||||
switch(*((char*)context))
|
||||
{
|
||||
case UCNV_PRV_ESCAPE_JAVA:
|
||||
while (i < length)
|
||||
{
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
|
||||
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
|
||||
valueStringLength += 4;
|
||||
}
|
||||
break;
|
||||
|
||||
case UCNV_PRV_ESCAPE_C:
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
|
||||
|
||||
if(length==2){
|
||||
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
|
||||
valueStringLength += itou (valueString + valueStringLength, temp, 16, 8);
|
||||
|
||||
}
|
||||
else{
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
|
||||
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
|
||||
}
|
||||
break;
|
||||
|
||||
case UCNV_PRV_ESCAPE_XML_DEC:
|
||||
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
|
||||
if(length==2){
|
||||
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
|
||||
valueStringLength += itou (valueString + valueStringLength, temp, 10, 0);
|
||||
|
||||
}
|
||||
else{
|
||||
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 10, 4);
|
||||
}
|
||||
break;
|
||||
|
||||
case UCNV_PRV_ESCAPE_XML_HEX:
|
||||
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
|
||||
if(length==2){
|
||||
UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
|
||||
valueStringLength += itou (valueString + valueStringLength, temp, 16, 0);
|
||||
|
||||
}
|
||||
else{
|
||||
valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
while (i < length)
|
||||
{
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
||||
valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
|
||||
itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
|
||||
valueStringLength += 4;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
myValueSource = valueString;
|
||||
|
||||
/* reset the error */
|
||||
|
@ -224,9 +339,29 @@ void UCNV_TO_U_CALLBACK_SKIP (
|
|||
UConverterCallbackReason reason,
|
||||
UErrorCode * err)
|
||||
{
|
||||
if (reason <= UCNV_IRREGULAR)
|
||||
if(context==NULL)
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
if (reason <= UCNV_IRREGULAR)
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
else if(*((char*)context)=='i')
|
||||
{
|
||||
if(reason != UCNV_UNASSIGNED)
|
||||
{
|
||||
/* the caller must have set
|
||||
* the error code accordingly
|
||||
*/
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -238,15 +373,34 @@ void UCNV_TO_U_CALLBACK_SUBSTITUTE (
|
|||
UConverterCallbackReason reason,
|
||||
UErrorCode * err)
|
||||
{
|
||||
if (reason > UCNV_IRREGULAR)
|
||||
if(context == NULL)
|
||||
{
|
||||
if (reason > UCNV_IRREGULAR)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
*err = U_ZERO_ERROR;
|
||||
ucnv_cbToUWriteSub(toArgs,0,err);
|
||||
return;
|
||||
}
|
||||
|
||||
*err = U_ZERO_ERROR;
|
||||
ucnv_cbToUWriteSub(toArgs,0,err);
|
||||
else if(*((char*)context)=='i')
|
||||
{
|
||||
if(reason != UCNV_UNASSIGNED)
|
||||
{
|
||||
/* the caller must have set
|
||||
* the error code accordingly
|
||||
*/
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*err = U_ZERO_ERROR;
|
||||
ucnv_cbToUWriteSub(toArgs,0,err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*uses itou to get a unicode escape sequence of the offensive sequence,
|
||||
|
|
|
@ -60,6 +60,21 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/**
|
||||
* FROM_U, TO_U options for sub and skip callbacks
|
||||
*/
|
||||
#define UCNV_SUB_STOP_ON_ILLEGAL "i"
|
||||
#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
|
||||
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE options
|
||||
*/
|
||||
#define UCNV_ESCAPE_ICU NULL
|
||||
#define UCNV_ESCAPE_JAVA "J"
|
||||
#define UCNV_ESCAPE_C "C"
|
||||
#define UCNV_ESCAPE_XML_DEC "D"
|
||||
#define UCNV_ESCAPE_XML_HEX "X"
|
||||
|
||||
/**
|
||||
* The process condition code to be used with the callbacks.
|
||||
*/
|
||||
|
@ -132,6 +147,7 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
|
|||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
|
||||
|
@ -144,8 +160,14 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
|
|||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback skips any illegal sequence,
|
||||
* This From Unicode callback skips any ILLEGAL_SEQUENCE, or
|
||||
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
|
||||
* simply ignoring those characters.
|
||||
* @param context: the function currently recognizes the callback options:
|
||||
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Skips any ILLEGAL_SEQUENCE
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
|
||||
|
@ -159,9 +181,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
|
|||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
|
||||
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or
|
||||
* UNASSIGNED_SEQUENCE depending on context parameter, with the
|
||||
* current substitution string for the converter. This is the default
|
||||
* callback.
|
||||
* @param context: the function currently recognizes the callback options:
|
||||
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Substitutes any ILLEGAL_SEQUENCE
|
||||
* @see ucnv_setSubstChars
|
||||
* @stable
|
||||
*/
|
||||
|
@ -179,11 +206,39 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
|
|||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
|
||||
* hexadecimal representation of the illegal codepoints
|
||||
* (in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). In the Event the
|
||||
* converter doesn't support the characters {u,%}[A-F][0-9], it will
|
||||
* substitute the illegal sequence with the substitution characters.
|
||||
* Note that percent (%) was chosen because backslash (\) does not exist
|
||||
* on many converters.
|
||||
|
||||
* @param context: the function currently recognizes the callback options:
|
||||
*
|
||||
* UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation (in the format %UXXXX, e.g. "%UFFFE%U00AC%UC8FE").
|
||||
* In the event the converter doesn't support the characters {U,%}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that a supplementary code point (a surrogate pair, e.g. U+23456) is represented as
|
||||
* %UD84D%UDC56
|
||||
* UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation (in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE").
|
||||
* In the Event the converter doesn't support the characters {u,\}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that a supplementary code point (a surrogate pair, e.g. U+23456) is represented as
|
||||
* \uD84D\uDC56
|
||||
* UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation (in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE").
|
||||
* In the Event the converter doesn't support the characters {u,U,\}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that a supplementary code point (a surrogate pair, e.g. U+23456) is represented as
|
||||
* \U00023456
|
||||
* UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
|
||||
* representation (in the format &#DDDDDDDD, e.g. "&#65534&#172&#51454").
|
||||
* In the Event the converter doesn't support the characters {&,#}[0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that a supplementary code point (a surrogate pair, e.g. U+23456) is represented as
|
||||
* &#144470 and zero padding is ignored.
|
||||
* UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation (in the format &#xXXXX, e.g. "&#xFFFE&#x00AC&#xC8FE").
|
||||
* In the event the converter doesn't support the characters {&,#,x}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that a supplementary code point (a surrogate pair, e.g. U+23456) is represented as
|
||||
* &#x23456
|
||||
* @stable
|
||||
*/
|
||||
|
||||
|
@ -199,8 +254,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
|
|||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback skips any illegal sequence,
|
||||
* This To Unicode callback skips any ILLEGAL_SEQUENCE, or
|
||||
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
|
||||
* simply ignoring those characters.
|
||||
* @param context: the function currently recognizes the callback options:
|
||||
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Skips any ILLEGAL_SEQUENCE
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
|
||||
|
@ -213,8 +274,13 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
|
|||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the
|
||||
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE, or
|
||||
* UNASSIGNED_SEQUENCE depending on context parameter, with the
|
||||
* Unicode substitution character, U+FFFD.
|
||||
* @param context: the function currently recognizes the callback options:
|
||||
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Substitutes any ILLEGAL_SEQUENCE
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
|
||||
|
|
Loading…
Add table
Reference in a new issue