ICU-103 replace the mbcs implementation by one that supports up to 4 bytes/char and full utf-16

X-SVN-Rev: 1830
2025-04-08 06:53:45 +00:00 · 2000-07-13 00:10:29 +00:00 · 2000-07-13 00:10:29 +00:00 · f0b6b788f2
commit f0b6b788f2
parent 8401b1c498
4 changed files with 1447 additions and 763 deletions
--- a/icu4c/source/common/ucnv_cnv.h
+++ b/icu4c/source/common/ucnv_cnv.h
@ -21,6 +21,7 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv_err.h"
 #include "ucnv_bld.h"
+#include "ucnvmbcs.h"
 #include "ucmp8.h"
 #include "ucmp16.h"

@ -43,16 +44,6 @@ typedef struct
  }
 UConverterDBCSTable;

-typedef struct
-  {
-    UBool *starters; /* [256]; */
-    CompactShortArray toUnicode;
-    CompactShortArray fromUnicode;
-    CompactShortArray toUnicodeFallback;
-    CompactShortArray fromUnicodeFallback;
-  }
-UConverterMBCSTable;
-
 union UConverterTable
  {
    UConverterSBCSTable sbcs;
@ -141,8 +132,7 @@ U_CDECL_BEGIN
                 args->offsets = saveOffsets; \
                  for (;My_i < myTargetIndex;My_i++) {args->offsets[My_i] += currentOffset  ;   } \
                }
-/*
-*/
+

 typedef void (*UConverterLoad) (UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode);
 typedef void (*UConverterUnload) (UConverterSharedData *sharedData);
--- a/icu4c/source/common/ucnv_lmb.c
+++ b/icu4c/source/common/ucnv_lmb.c
@ -1014,8 +1014,6 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
      UConverterDataLMBCS * extraInfo;
      ulmbcs_byte_t group; 
      UConverter* cnv; 
-      uint16_t mbChar;
-      CompactShortArray *MyCArray;
            
      if (CurByte == ULMBCS_GRP_CTRL)  /* Control character group - no opt group update */
      {
@ -1056,15 +1054,20 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
      
         else if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
         {
-            ulmbcs_byte_t  HighCh, LowCh;

            CHECK_SOURCE_LIMIT(2);
-            HighCh = *(args->source)++; 
-            LowCh = *(args->source)++;
-          /* check for LMBCS doubled-group-byte case */
-            mbChar = (HighCh == group) ? LowCh : (HighCh<<8) | LowCh;
-            MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
-            uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
+
+            /* check for LMBCS doubled-group-byte case */
+            if (*args->source == group) {
+               /* single byte */
+               ++args->source;
+               uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
+            } else {
+               /* double byte */
+               const char *newLimit = args->source + 2;
+               uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit);
+               args->source = newLimit; /* set the correct limit even in case of an error */
+            }
         }
         else {                                  /* single byte conversion */
            CHECK_SOURCE_LIMIT(1);
@ -1079,13 +1082,17 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
            /* The non-optimizable oddballs where there is an explicit byte 
             * AND the second byte is not in the upper ascii range
            */
+               const char *s;
+               char bytes[2];
+
               extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
               cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];  
            
            /* Lookup value must include opt group */
-               mbChar =  (UChar)(group << 8) | (UChar) CurByte;
-               MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
-               uniChar = (UChar) ucmp16_getu(MyCArray, mbChar);
+               bytes[0] = group;
+               bytes[1] = CurByte;
+               s = bytes;
+               uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2);
            }
         }
      }
@ -1096,22 +1103,24 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
         cnv = extraInfo->OptGrpConverter[group];
         if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
         {
-            ulmbcs_byte_t  HighCh, LowCh;
-      
-            if (cnv->sharedData->table->mbcs.starters[CurByte] == FALSE)
+            if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
            {
               CHECK_SOURCE_LIMIT(0);
-               mbChar = CurByte;
+
+               /* let the MBCS conversion consume CurByte again */
+               --args->source;
+               uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
            }
            else
            {
               CHECK_SOURCE_LIMIT(1);
-               HighCh = CurByte;
-               LowCh = *(args->source)++;
-               mbChar = (HighCh<<8) | LowCh;
+
+               /* let the MBCS conversion consume CurByte again */
+               --args->source;
+
+               /* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
+               uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2);
            }
-            MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
-            uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
         }
         else                                   /* single byte conversion */
         {
@ -1119,14 +1128,15 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
         }
      }
   }
-   if (uniChar == missingUCharMarker)
+   if ((uniChar - 0xfffd) <= 2) /* 0xfffd<=uniChar<=0xffff, was: uniChar == missingUCharMarker */
   {
-       /*It's is very likely that the ErrorFunctor will write to the
+       /*It is very likely that the ErrorFunctor will write to the
       *internal buffers */

      /* This code needs updating when new error callbacks are installed */

      UChar * pUniChar = (UChar *)&uniChar;
+      *err = U_INVALID_CHAR_FOUND;
      args->target = pUniChar;
      args->targetLimit = pUniChar + 1;
      args->source = saveSource;
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
--- a/icu4c/source/common/ucnvmbcs.h
+++ b/icu4c/source/common/ucnvmbcs.h
@ -0,0 +1,118 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2000, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  ucnvmbcs.h
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2000jul07
+*   created by: Markus W. Scherer
+*/
+
+#ifndef __UCNVMBCS_H__
+#define __UCNVMBCS_H__
+
+#include "unicode/utypes.h"
+
+/* MBCS converter data and state -------------------------------------------- */
+
+enum { /* unnamed enum: the enumerators take the implicit values 0..8 in declaration order */
+    MBCS_STATE_ILLEGAL,             /* 0 */
+    MBCS_STATE_CHANGE_ONLY,         /* 1 */
+    MBCS_STATE_UNASSIGNED,          /* 2 */
+
+    MBCS_STATE_FALLBACK_DIRECT_16,  /* 3 */
+    MBCS_STATE_FALLBACK_DIRECT_20,  /* 4 */
+
+    MBCS_STATE_VALID_DIRECT_16,     /* 5 */
+    MBCS_STATE_VALID_DIRECT_20,     /* 6 */
+
+    MBCS_STATE_VALID_16,            /* 7 */
+    MBCS_STATE_VALID_16_PAIR        /* 8 */
+}; /* NOTE(review): presumably the kinds of stateTable entries; their exact meaning is defined in ucnvmbcs.c, which is not visible here -- confirm */
+
+enum { /* NOTE(review): presumably the values stored in UConverterMBCSTable.outputType -- confirm against ucnvmbcs.c */
+    MBCS_OUTPUT_1,          /* 0 */
+    MBCS_OUTPUT_2,          /* 1 */
+    MBCS_OUTPUT_3,          /* 2 */
+    MBCS_OUTPUT_4,          /* 3 */
+
+    MBCS_OUTPUT_3_EUC=8,    /* explicit value 8; values 4..7 are left unused */
+    MBCS_OUTPUT_4_EUC       /* 9 */
+};
+
+typedef struct { /* element type of UConverterMBCSTable.toUFallbacks; same two members as the toUFallbacks[] records in the .cnv layout comment in this header */
+    uint32_t offset;    /* the .cnv layout comment states that fallbacks are sorted by this member */
+    UChar32 codePoint;  /* NOTE(review): presumably the Unicode code point used as the fallback result -- confirm */
+} _MBCSToUFallback;
+
+typedef struct UConverterMBCSTable { /* MBCS converter data; replaces the CompactShortArray-based struct removed from ucnv_cnv.h in this commit */
+    /* toUnicode */
+    uint8_t countStates;            /* number of 256-entry rows in stateTable */
+    uint32_t countToUFallbacks;     /* number of entries in toUFallbacks (per the .cnv layout comment in this header) */
+
+    const int32_t (*stateTable)/*[countStates]*/[256];  /* pointer to rows of 256 int32_t entries each */
+    const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/;  /* 16-bit units; countUnicodeResults is not a member of this struct */
+    const _MBCSToUFallback *toUFallbacks;  /* the .cnv layout comment states these are sorted by offset */
+
+    /* fromUnicode */
+    const uint16_t *fromUnicodeTable;   /* NOTE(review): presumably the fromUTable section of the .cnv layout -- confirm */
+    const uint8_t *fromUnicodeBytes;    /* NOTE(review): presumably the fromUBytes section of the .cnv layout -- confirm */
+    uint8_t outputType;                 /* NOTE(review): presumably bits 7..0 of the header flags, one of MBCS_OUTPUT_* -- confirm */
+} UConverterMBCSTable;
+
+/*
+ * MBCS data structure as part of a .cnv file:
+ *
+ * uint32_t [8]; -- 8 values:
+ *  0   MBCS version in UVersionInfo format (1.0.0.0)
+ *  1   countStates
+ *  2   countToUFallbacks
+ *  3   offsetToUCodeUnits (offsets are counted from the beginning of this header structure)
+ *  4   offsetFromUTable
+ *  5   offsetFromUBytes
+ *  6   flags, bits:
+ *          31.. 8 reserved
+ *           7.. 0 outputType
+ *  7   reserved
+ *
+ * stateTable[countStates][256];
+ *
+ * struct { (fallbacks are sorted by offset)
+ *     uint32_t offset;
+ *     UChar32 codePoint;
+ * } toUFallbacks[countToUFallbacks];
+ *
+ * uint16_t unicodeCodeUnits[?]; (even number of units or padded)
+ *
+ * uint16_t fromUTable[0x440+?]; (32-bit-aligned)
+ *
+ * uint8_t fromUBytes[?];
+ */
+typedef struct { /* the eight 32-bit header values listed in the layout comment above */
+    UVersionInfo version;               /* value 0; assumes UVersionInfo occupies 4 bytes, as the uint32_t[8] layout implies */
+    uint32_t countStates,               /* value 1 */
+             countToUFallbacks,         /* value 2 */
+             offsetToUCodeUnits,        /* value 3; offsets are counted from the beginning of this header */
+             offsetFromUTable,          /* value 4 */
+             offsetFromUBytes,          /* value 5 */
+             flags,                     /* value 6; bits 7..0 = outputType, bits 31..8 reserved */
+             reserved;                  /* value 7 */
+} _MBCSHeader;
+
+struct UConverterSharedData; /* forward declaration only: the struct body is not defined in this header */
+typedef struct UConverterSharedData UConverterSharedData; /* lets the prototypes below take UConverterSharedData pointers */
+
+U_CFUNC UChar32 /* converted character; the LMBCS caller (ucnv_lmb.c) treats results in 0xfffd..0xffff as a failed conversion */
+_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, /* converter data; callers pass cnv->sharedData */
+                        const char **pSource, const char *sourceLimit); /* *pSource is advanced past the consumed bytes; sourceLimit bounds the input (callers pass start+1 or start+2). NOTE(review): on error *pSource may stop short of sourceLimit -- the LMBCS caller resets it explicitly; confirm */
+
+U_CFUNC UBool /* the LMBCS caller converts two bytes when this is true and a single byte when it is false */
+_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte); /* NOTE(review): byte is plain char while the LMBCS caller passes a ulmbcs_byte_t; behavior for values >= 0x80 depends on char signedness unless the implementation casts -- confirm */
+
+#endif