ICU-858 Implemented options recognition and output generation for SUB, SKIP and ESCAPE callbacks

X-SVN-Rev: 3641
2025-04-21 12:40:02 +00:00 · 2001-02-16 20:12:50 +00:00 · 2001-02-16 20:12:50 +00:00 · d53a33cd04
commit d53a33cd04
parent a03a11d085
2 changed files with 283 additions and 63 deletions
--- a/icu4c/source/common/ucnv_err.c
+++ b/icu4c/source/common/ucnv_err.c
@ -24,11 +24,20 @@
 #define VALUE_STRING_LENGTH 32
 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
-#define UNICODE_U_CODEPOINT 0x0055
-#define UNICODE_X_CODEPOINT 0x0058
+#define UNICODE_U_CODEPOINT     0x0055
+#define UNICODE_X_CODEPOINT     0x0058
+#define UNICODE_RS_CODEPOINT    0x005C
+#define UNICODE_U_LOW_CODEPOINT 0x0075
+#define UNICODE_X_LOW_CODEPOINT 0x0078
+#define UNICODE_AMP_CODEPOINT   0x0026
+#define UNICODE_HASH_CODEPOINT  0x0023

+#define UCNV_PRV_ESCAPE_ICU      NULL
+#define UCNV_PRV_ESCAPE_C       'C'
+#define UCNV_PRV_ESCAPE_XML_DEC 'D'
+#define UCNV_PRV_ESCAPE_XML_HEX 'X'
+#define UCNV_PRV_ESCAPE_JAVA    'J'

-#define ToOffset(a) a<=9?(0x0030+a):(0x0030+a+7)

 UBool 
  CONVERSION_U_SUCCESS (UErrorCode err)
@ -40,7 +49,8 @@ UBool
 /*Takes a int32_t and fills in  a UChar* string with that number "radix"-based
 * and padded with "pad" zeroes
 */
-static void   itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
+#define MAX_DIGITS 10
+static  int32_t itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
 {
  int32_t length = 0;
  int32_t num = 0;
@ -48,28 +58,30 @@ static void   itou (UChar * buffer, uint32_t i, uint32_t radix, int32_t pad)
  int32_t j;
  UChar temp;

-  while (i >= radix)
-    {
-      num = i / radix;
-      digit = (int8_t) (i - num * radix);
-      buffer[length++] = (UChar) (ToOffset (digit));
-      i = num;
-    }
+  do{
+      digit = (int)(i % radix);
+      buffer[length++]=(UChar)(digit<=9?(0x0030+digit):(0x0030+digit+7));
+  }while(i=i/radix);

-  buffer[length] = (UChar) (ToOffset (i));
+  while (length < pad)   
+      buffer[length++] = (UChar) 0x0030;/*zero padding */

-  while (length < pad)   buffer[++length] = (UChar) 0x0030;	/*zero padding */
-  buffer[length--] = (UChar) 0x0000;
+  if(length<MAX_DIGITS){
+       buffer[length--] = (UChar) 0x0000;
+  }
+  num= (pad>=length) ? pad :length;
  
  /*Reverses the string */
-  for (j = 0; j < (pad / 2); j++)
+  for (j = 0; j < (num / 2); j++)
    {
      temp = buffer[length - j];
      buffer[length - j] = buffer[j];
      buffer[j] = temp;
    }

-  return;
+  /* truncates the padding */
+  
+  return length+1;
 }

 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
@ -109,10 +121,30 @@ void   UCNV_FROM_U_CALLBACK_SKIP (
                  UConverterCallbackReason reason,
                  UErrorCode * err)
 {
-  if (reason <= UCNV_IRREGULAR)
-  {
-    *err = U_ZERO_ERROR;
-  }
+    if(context==NULL)
+    {
+        if (reason <= UCNV_IRREGULAR)
+        {
+            *err = U_ZERO_ERROR;
+            return;
+        }
+
+    }
+    else if(*(char*)context=='i')
+    {
+        if(reason != UCNV_UNASSIGNED)
+        {
+            /* the caller must have set 
+             * the error code accordingly
+             */
+            return;
+        }
+        else
+        {
+            *err = U_ZERO_ERROR;
+            return;
+        }
+    }
 }

 void   UCNV_FROM_U_CALLBACK_SUBSTITUTE (
@ -124,14 +156,33 @@ void   UCNV_FROM_U_CALLBACK_SUBSTITUTE (
                  UConverterCallbackReason reason,
                  UErrorCode * err)
 {
-  if (reason > UCNV_IRREGULAR)
-  {
-    return;
-  }
-  
-  *err = U_ZERO_ERROR;
-  
-  ucnv_cbFromUWriteSub(fromArgs, 0, err);
+    if(context == NULL)
+    {
+        if (reason > UCNV_IRREGULAR)
+        {
+            return;
+        }
+    
+        *err = U_ZERO_ERROR;
+        ucnv_cbFromUWriteSub(fromArgs, 0, err);
+        return;
+    }
+    else if(*((char*)context)=='i')
+    {
+        if(reason != UCNV_UNASSIGNED)
+        {
+            /* the caller must have set 
+             * the error code accordingly
+             */
+            return;
+        }
+        else
+        {
+            *err = U_ZERO_ERROR;
+            ucnv_cbFromUWriteSub(fromArgs, 0, err);
+            return;
+        }
+    }
 }

 /*uses itou to get a unicode escape sequence of the offensive sequence,
@ -160,38 +211,102 @@ void   UCNV_FROM_U_CALLBACK_ESCAPE (

  UConverterFromUCallback ignoredCallback = NULL;
  void *ignoredContext;
-
+  
  if (reason > UCNV_IRREGULAR)
  {
    return;
  }

  ucnv_setFromUCallBack (fromArgs->converter,
-             (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
-             NULL,  /* To Do for HSYS: context is null? */
-             &original,
-             &originalContext,
-             &err2);
+                     (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+                     NULL,  /* To Do for HSYS: context is null? */
+                     &original,
+                     &originalContext,
+                     &err2);
+  
  if (U_FAILURE (err2))
  {
    *err = err2;
    return;
+  } 
+  if(context==NULL)
+  { 
+      while (i < length)
+      {
+        valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;	/* adding % */
+        valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;	/* adding U */
+        itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
+        valueStringLength += 4;
+      }
  }
-  
-  /*
-   * ### TODO:
-   * This should actually really work with the codePoint, not with the codeUnits;
-   * how do we represent a code point > 0xffff? It should be one single escape, not
-   * two for a surrogate pair!
-   */
-  while (i < length)
+  else
  {
-    valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;	/* adding % */
-    valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;	/* adding U */
-    itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
-    valueStringLength += 4;
-  }
+      switch(*((char*)context))
+      {
+        case UCNV_PRV_ESCAPE_JAVA:
+          while (i < length)
+          {
+            valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
+            valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT;	/* adding u */
+            itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
+            valueStringLength += 4;
+          }
+          break;

+        case UCNV_PRV_ESCAPE_C:
+            valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;	/* adding \ */
+
+            if(length==2){
+                UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
+                valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT;	/* adding u */
+                valueStringLength += itou (valueString + valueStringLength, temp, 16, 8);
+                
+            }
+            else{
+                valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;	/* adding U */
+                valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
+            }
+          break;
+
+        case UCNV_PRV_ESCAPE_XML_DEC:
+    
+            valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;	/* adding & */
+            valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;	/* adding # */
+            if(length==2){
+                UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
+                valueStringLength += itou (valueString + valueStringLength, temp, 10, 0);
+                
+            }
+            else{
+                valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 10, 4);
+            }
+          break;
+
+        case UCNV_PRV_ESCAPE_XML_HEX:
+
+            valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;	/* adding & */
+            valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;	/* adding # */
+            valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
+            if(length==2){
+                UChar32 temp = UTF16_GET_PAIR_VALUE(codeUnits[0],codeUnits[1]);
+                valueStringLength += itou (valueString + valueStringLength, temp, 16, 0);
+                
+            }
+            else{
+                valueStringLength += itou (valueString + valueStringLength, codeUnits[0], 16, 4);
+            }
+          break;
+       default:
+          while (i < length)
+          {
+            valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;	/* adding % */
+            valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;	            /* adding U */
+            itou (valueString + valueStringLength, codeUnits[i++], 16, 4);
+            valueStringLength += 4;
+          }
+      }
+
+  }  
  myValueSource = valueString;

  /* reset the error */
@ -224,9 +339,29 @@ void UCNV_TO_U_CALLBACK_SKIP (
                 UConverterCallbackReason reason,
                 UErrorCode * err)
 {
-    if (reason <= UCNV_IRREGULAR)
+    if(context==NULL)
    {
-        *err = U_ZERO_ERROR;
+        if (reason <= UCNV_IRREGULAR)
+        {
+            *err = U_ZERO_ERROR;
+            return;
+        }
+
+    }
+    else if(*((char*)context)=='i')
+    {
+        if(reason != UCNV_UNASSIGNED)
+        {
+            /* the caller must have set 
+             * the error code accordingly
+             */
+            return;
+        }
+        else
+        {
+            *err = U_ZERO_ERROR;
+            return;
+        }
    }
 }

@ -238,15 +373,34 @@ void   UCNV_TO_U_CALLBACK_SUBSTITUTE (
                 UConverterCallbackReason reason,
                 UErrorCode * err)
 {
-    if (reason > UCNV_IRREGULAR)
+    if(context == NULL)
    {
+        if (reason > UCNV_IRREGULAR)
+        {
+            return;
+        }
+    
+        *err = U_ZERO_ERROR;
+        ucnv_cbToUWriteSub(toArgs,0,err);
        return;
    }
-    
-    *err = U_ZERO_ERROR;
-    ucnv_cbToUWriteSub(toArgs,0,err);
+    else if(*((char*)context)=='i')
+    {
+        if(reason != UCNV_UNASSIGNED)
+        {
+            /* the caller must have set 
+             * the error code accordingly
+             */
+            return;
+        }
+        else
+        {
+            *err = U_ZERO_ERROR;
+            ucnv_cbToUWriteSub(toArgs,0,err);
+            return;
+        }
+    }

-    return;
 }

 /*uses itou to get a unicode escape sequence of the offensive sequence,
--- a/icu4c/source/common/unicode/ucnv_err.h
+++ b/icu4c/source/common/unicode/ucnv_err.h
@ -60,6 +60,21 @@

 #include "unicode/utypes.h"

+/**
+ * FROM_U, TO_U options for sub and skip callbacks
+ */
+#define UCNV_SUB_STOP_ON_ILLEGAL "i"
+#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
+
+/**
+ * FROM_U_CALLBACK_ESCAPE options
+ */
+#define UCNV_ESCAPE_ICU     NULL
+#define UCNV_ESCAPE_JAVA    "J"
+#define UCNV_ESCAPE_C       "C"
+#define UCNV_ESCAPE_XML_DEC "D"
+#define UCNV_ESCAPE_XML_HEX "X"
+
 /** 
 * The process condition code to be used with the callbacks.  
 */
@ -132,6 +147,7 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
 * DO NOT CALL THIS FUNCTION DIRECTLY!
 * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
 * returning the error code back to the caller immediately.
+ * 
 * @stable
 */
 U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
@ -144,8 +160,14 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (

 /**
 * DO NOT CALL THIS FUNCTION DIRECTLY!
- * This From Unicode callback skips any illegal sequence, 
+ * This From Unicode callback skips any ILLEGAL_SEQUENCE, or
+ * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
 * simply ignoring those characters. 
+ * @param context: the function currently recognizes the callback options:
+ *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
+ *                      returning the error code back to the caller immediately.
+ *                 NULL: Skips any ILLEGAL_SEQUENCE
+ *                     
 * @stable
 */
 U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
@ -159,9 +181,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (

 /**
 * DO NOT CALL THIS FUNCTION DIRECTLY!
- * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
+ * This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or 
+ * UNASSIGNED_SEQUENCE depending on context parameter, with the
 * current substitution string for the converter. This is the default
 * callback.
+ * @param context: the function currently recognizes the callback options:
+ *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
+ *                      returning the error code back to the caller immediately.
+ *                 NULL: Substitutes any ILLEGAL_SEQUENCE
 * @see ucnv_setSubstChars
 * @stable
 */
@ -179,11 +206,39 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
 * DO NOT CALL THIS FUNCTION DIRECTLY!
 * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
 * hexadecimal representation of the illegal codepoints
- *  (in the format  %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). In the Event the
- * converter doesn't support the characters {u,%}[A-F][0-9], it will 
- * substitute  the illegal sequence with the substitution characters.
- * Note that percent (%) was chosen because backslash (\) does not exist
- * on many converters.
+
+ * @param context: the function currently recognizes the callback options:
+ *        
+ *        UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
+ *          representation in the format %UXXXX, e.g. "%UFFFE%U00AC%UC8FE".
+ *          In the event the converter doesn't support the characters {U,%}[A-F][0-9],
+ *          it will substitute the illegal sequence with the substitution characters.
+ *          Note that a surrogate pair is escaped one 16-bit code unit at a time, e.g.
+ *          %UD84D%UDC56
+ *        UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
+ *          representation in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE".
+ *          In the event the converter doesn't support the characters {u,\}[A-F][0-9],
+ *          it will substitute the illegal sequence with the substitution characters.
+ *          Note that a surrogate pair is escaped one 16-bit code unit at a time, e.g.
+ *          \uD84D\uDC56
+ *        UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
+ *          representation in the format \uXXXX, e.g. "\uFFFE\u00AC\uC8FE".
+ *          In the event the converter doesn't support the characters {u,U,\}[A-F][0-9],
+ *          it will substitute the illegal sequence with the substitution characters.
+ *          Note that a surrogate pair is combined into one code point and represented as
+ *          \U00023456
+ *        UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
+ *          representation in the format &#DDDDDDDD, e.g. "&#65534&#172&#51454".
+ *          In the event the converter doesn't support the characters {&,#}[0-9],
+ *          it will substitute the illegal sequence with the substitution characters.
+ *          Note that a surrogate pair is combined into one code point and represented as
+ *          &#144470 and zero padding is ignored.
+ *        UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
+ *          representation in the format &#xXXXX, e.g. "&#xFFFE&#x00AC&#xC8FE".
+ *          In the event the converter doesn't support the characters {&,#,x}[A-F][0-9],
+ *          it will substitute the illegal sequence with the substitution characters.
+ *          Note that a surrogate pair is combined into one code point and represented as
+ *          &#x23456
 * @stable
 */

@ -199,8 +254,14 @@ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (

 /**
 * DO NOT CALL THIS FUNCTION DIRECTLY!
- * This To Unicode callback skips any illegal sequence, 
+ * This To Unicode callback skips any ILLEGAL_SEQUENCE, or
+ * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
 * simply ignoring those characters. 
+ * @param context: the function currently recognizes the callback options:
+ *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
+ *                      returning the error code back to the caller immediately.
+ *                 NULL: Skips any ILLEGAL_SEQUENCE
+ *                     
 * @stable
 */
 U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
@ -213,8 +274,13 @@ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (

 /**
 * DO NOT CALL THIS FUNCTION DIRECTLY!
- * This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the
+ * This To Unicode callback will Substitute the ILLEGAL SEQUENCE, or
+ * UNASSIGNED_SEQUENCE depending on context parameter,  with the
 * Unicode substitution character, U+FFFD.
+ * @param context: the function currently recognizes the callback options:
+ *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
+ *                      returning the error code back to the caller immediately.
+ *                 NULL: Substitutes any ILLEGAL_SEQUENCE
 * @stable
 */
 U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (