ICU-103 replace the mbcs implementation by one that supports up to 4 bytes/char and full utf-16

X-SVN-Rev: 1830
This commit is contained in:
Markus Scherer 2000-07-13 00:10:29 +00:00
parent 8401b1c498
commit f0b6b788f2
4 changed files with 1447 additions and 763 deletions

View file

@ -21,6 +21,7 @@
#include "unicode/utypes.h"
#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucmp8.h"
#include "ucmp16.h"
@ -43,16 +44,6 @@ typedef struct
}
UConverterDBCSTable;
typedef struct
{
UBool *starters; /* [256]; */
CompactShortArray toUnicode;
CompactShortArray fromUnicode;
CompactShortArray toUnicodeFallback;
CompactShortArray fromUnicodeFallback;
}
UConverterMBCSTable;
union UConverterTable
{
UConverterSBCSTable sbcs;
@ -141,8 +132,7 @@ U_CDECL_BEGIN
args->offsets = saveOffsets; \
for (;My_i < myTargetIndex;My_i++) {args->offsets[My_i] += currentOffset ; } \
}
/*
*/
typedef void (*UConverterLoad) (UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode);
typedef void (*UConverterUnload) (UConverterSharedData *sharedData);

View file

@ -1014,8 +1014,6 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
UConverterDataLMBCS * extraInfo;
ulmbcs_byte_t group;
UConverter* cnv;
uint16_t mbChar;
CompactShortArray *MyCArray;
if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */
{
@ -1056,15 +1054,20 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
else if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
{
ulmbcs_byte_t HighCh, LowCh;
CHECK_SOURCE_LIMIT(2);
HighCh = *(args->source)++;
LowCh = *(args->source)++;
/* check for LMBCS doubled-group-byte case */
mbChar = (HighCh == group) ? LowCh : (HighCh<<8) | LowCh;
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
/* check for LMBCS doubled-group-byte case */
if (*args->source == group) {
/* single byte */
++args->source;
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
} else {
/* double byte */
const char *newLimit = args->source + 2;
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit);
args->source = newLimit; /* set the correct limit even in case of an error */
}
}
else { /* single byte conversion */
CHECK_SOURCE_LIMIT(1);
@ -1079,13 +1082,17 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
/* The non-optimizable oddballs where there is an explicit byte
* AND the second byte is not in the upper ascii range
*/
const char *s;
char bytes[2];
extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];
/* Lookup value must include opt group */
mbChar = (UChar)(group << 8) | (UChar) CurByte;
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
uniChar = (UChar) ucmp16_getu(MyCArray, mbChar);
bytes[0] = group;
bytes[1] = CurByte;
s = bytes;
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2);
}
}
}
@ -1096,22 +1103,24 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
cnv = extraInfo->OptGrpConverter[group];
if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
{
ulmbcs_byte_t HighCh, LowCh;
if (cnv->sharedData->table->mbcs.starters[CurByte] == FALSE)
if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
{
CHECK_SOURCE_LIMIT(0);
mbChar = CurByte;
/* let the MBCS conversion consume CurByte again */
--args->source;
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
}
else
{
CHECK_SOURCE_LIMIT(1);
HighCh = CurByte;
LowCh = *(args->source)++;
mbChar = (HighCh<<8) | LowCh;
/* let the MBCS conversion consume CurByte again */
--args->source;
/* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2);
}
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
}
else /* single byte conversion */
{
@ -1119,14 +1128,15 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
}
}
}
if (uniChar == missingUCharMarker)
if ((uniChar - 0xfffd) <= 2) /* 0xfffd<=uniChar<=0xffff, was: uniChar == missingUCharMarker */
{
/*It's is very likely that the ErrorFunctor will write to the
/*It is very likely that the ErrorFunctor will write to the
*internal buffers */
/* This code needs updating when new error callbacks are installed */
UChar * pUniChar = (UChar *)&uniChar;
*err = U_INVALID_CHAR_FOUND;
args->target = pUniChar;
args->targetLimit = pUniChar + 1;
args->source = saveSource;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,118 @@
/*
*******************************************************************************
*
* Copyright (C) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucnvmbcs.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2000jul07
* created by: Markus W. Scherer
*/
#ifndef __UCNVMBCS_H__
#define __UCNVMBCS_H__
#include "unicode/utypes.h"
/* MBCS converter data and state -------------------------------------------- */
/*
 * MBCS_STATE_* constants.
 * The numeric values are consecutive, starting at 0; they are spelled out
 * explicitly so that inserting or reordering a name cannot silently renumber
 * the others.
 * NOTE(review): presumably these classify what a toUnicode state-table entry
 * yields (illegal, unassigned, fallback, valid result, ...) -- confirm against
 * the implementation in ucnvmbcs.c.
 */
enum {
    MBCS_STATE_ILLEGAL            = 0,
    MBCS_STATE_CHANGE_ONLY        = 1,
    MBCS_STATE_UNASSIGNED         = 2,
    MBCS_STATE_FALLBACK_DIRECT_16 = 3,
    MBCS_STATE_FALLBACK_DIRECT_20 = 4,
    MBCS_STATE_VALID_DIRECT_16    = 5,
    MBCS_STATE_VALID_DIRECT_20    = 6,
    MBCS_STATE_VALID_16           = 7,
    MBCS_STATE_VALID_16_PAIR      = 8
};
/*
 * MBCS_OUTPUT_* constants.
 * The plain variants are numbered 0..3; the EUC variants start at 8.
 * NOTE(review): presumably these are the values stored in
 * UConverterMBCSTable.outputType (the low 8 bits of the .cnv header flags)
 * -- confirm against ucnvmbcs.c before renumbering anything.
 */
enum {
    MBCS_OUTPUT_1     = 0,
    MBCS_OUTPUT_2     = 1,
    MBCS_OUTPUT_3     = 2,
    MBCS_OUTPUT_4     = 3,

    MBCS_OUTPUT_3_EUC = 8,
    MBCS_OUTPUT_4_EUC = 9
};
/*
 * One toUnicode fallback entry: an offset paired with a code point.
 * The .cnv layout notes in this header state that the toUFallbacks array
 * is sorted by offset.
 * NOTE(review): presumably `offset` identifies a position in the Unicode
 * results reached through the state table -- confirm against ucnvmbcs.c.
 */
typedef struct {
/* sort key of the fallback array */
uint32_t offset;
/* code point associated with this offset */
UChar32 codePoint;
} _MBCSToUFallback;
/*
 * Runtime view of the MBCS conversion data of one converter.
 * All table members are const pointers; the field names mirror the sections
 * of the .cnv MBCS data structure described in the layout comment of this
 * header (state table, toUnicode fallbacks, Unicode code units,
 * fromUnicode table and bytes).
 * NOTE(review): presumably the pointers refer directly into the loaded .cnv
 * data and are not owned by this struct -- confirm in the load function.
 */
typedef struct UConverterMBCSTable {
/* toUnicode */
/* number of rows in stateTable */
uint8_t countStates;
/* number of entries in toUFallbacks */
uint32_t countToUFallbacks;
/* pointer to rows of 256 int32_t entries, one row per state */
const int32_t (*stateTable)/*[countStates]*/[256];
/* 16-bit Unicode result units */
const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/;
/* fallback entries; the file format keeps them sorted by offset */
const _MBCSToUFallback *toUFallbacks;
/* fromUnicode */
/* 16-bit lookup table for the fromUnicode direction */
const uint16_t *fromUnicodeTable;
/* byte data for the fromUnicode direction */
const uint8_t *fromUnicodeBytes;
/* per the header layout, the low 8 bits of the flags word; presumably one of MBCS_OUTPUT_* -- confirm */
uint8_t outputType;
} UConverterMBCSTable;
/*
* MBCS data structure as part of a .cnv file:
*
* uint32_t [8]; -- 8 values:
* 0 MBCS version in UVersionInfo format (1.0.0.0)
* 1 countStates
* 2 countToUFallbacks
* 3 offsetToUCodeUnits (offsets are counted from the beginning of this header structure)
* 4 offsetFromUTable
* 5 offsetFromUBytes
* 6 flags, bits:
* 31.. 8 reserved
* 7.. 0 outputType
* 7 reserved
*
* stateTable[countStates][256];
*
* struct { (fallbacks are sorted by offset)
* uint32_t offset;
* UChar32 codePoint;
* } toUFallbacks[countToUFallbacks];
*
* uint16_t unicodeCodeUnits[?]; (even number of units or padded)
*
* uint16_t fromUTable[0x440+?]; (32-bit-aligned)
*
* uint8_t fromUBytes[?];
*/
/*
 * Header at the start of the MBCS data inside a .cnv file.
 * The members correspond, in order, to the 8 header values listed in the
 * layout description in this file: version, two counts, three offsets
 * (counted from the beginning of this header), flags, and a reserved word.
 */
typedef struct {
    /* value 0: MBCS format version in UVersionInfo format */
    UVersionInfo version;

    /* values 1-2: table sizes */
    uint32_t countStates;
    uint32_t countToUFallbacks;

    /* values 3-5: section offsets, relative to the start of this header */
    uint32_t offsetToUCodeUnits;
    uint32_t offsetFromUTable;
    uint32_t offsetFromUBytes;

    /* value 6: bits 7..0 hold outputType, bits 31..8 are reserved */
    uint32_t flags;

    /* value 7: reserved */
    uint32_t reserved;
} _MBCSHeader;
/*
 * Forward declaration: this header only passes UConverterSharedData by
 * pointer, so the full definition is not needed (and is not provided) here.
 */
struct UConverterSharedData;
typedef struct UConverterSharedData UConverterSharedData;
/*
 * Simple single-character MBCS-to-Unicode lookup for use by other converter
 * implementations.
 *
 * sharedData   shared converter data to use for the lookup
 * pSource      in/out: address of the current source pointer
 * sourceLimit  end of the input that may be consumed for this character
 *
 * Returns the result as a UChar32.
 * NOTE(review): presumably *pSource is advanced past the bytes consumed and
 * may stop short of sourceLimit on error; the exact error return values
 * are not visible in this header -- confirm against ucnvmbcs.c.
 */
U_CFUNC UChar32
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
const char **pSource, const char *sourceLimit);
/*
 * Queries whether `byte` is a lead byte for the MBCS data in sharedData.
 *
 * NOTE(review): `byte` is a plain char, whose signedness is
 * implementation-defined; presumably the implementation converts it to an
 * unsigned 8-bit value before any table lookup -- confirm in ucnvmbcs.c.
 */
U_CFUNC UBool
_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
#endif