mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-103 replace the mbcs implementation by one that supports up to 4 bytes/char and full utf-16
X-SVN-Rev: 1830
This commit is contained in:
parent
8401b1c498
commit
f0b6b788f2
4 changed files with 1447 additions and 763 deletions
|
@ -21,6 +21,7 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucnv_err.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucmp8.h"
|
||||
#include "ucmp16.h"
|
||||
|
||||
|
@ -43,16 +44,6 @@ typedef struct
|
|||
}
|
||||
UConverterDBCSTable;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
UBool *starters; /* [256]; */
|
||||
CompactShortArray toUnicode;
|
||||
CompactShortArray fromUnicode;
|
||||
CompactShortArray toUnicodeFallback;
|
||||
CompactShortArray fromUnicodeFallback;
|
||||
}
|
||||
UConverterMBCSTable;
|
||||
|
||||
union UConverterTable
|
||||
{
|
||||
UConverterSBCSTable sbcs;
|
||||
|
@ -141,8 +132,7 @@ U_CDECL_BEGIN
|
|||
args->offsets = saveOffsets; \
|
||||
for (;My_i < myTargetIndex;My_i++) {args->offsets[My_i] += currentOffset ; } \
|
||||
}
|
||||
/*
|
||||
*/
|
||||
|
||||
|
||||
typedef void (*UConverterLoad) (UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode);
|
||||
typedef void (*UConverterUnload) (UConverterSharedData *sharedData);
|
||||
|
|
|
@ -1014,8 +1014,6 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
|||
UConverterDataLMBCS * extraInfo;
|
||||
ulmbcs_byte_t group;
|
||||
UConverter* cnv;
|
||||
uint16_t mbChar;
|
||||
CompactShortArray *MyCArray;
|
||||
|
||||
if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */
|
||||
{
|
||||
|
@ -1056,15 +1054,20 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
|||
|
||||
else if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
|
||||
{
|
||||
ulmbcs_byte_t HighCh, LowCh;
|
||||
|
||||
CHECK_SOURCE_LIMIT(2);
|
||||
HighCh = *(args->source)++;
|
||||
LowCh = *(args->source)++;
|
||||
/* check for LMBCS doubled-group-byte case */
|
||||
mbChar = (HighCh == group) ? LowCh : (HighCh<<8) | LowCh;
|
||||
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
|
||||
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
|
||||
|
||||
/* check for LMBCS doubled-group-byte case */
|
||||
if (*args->source == group) {
|
||||
/* single byte */
|
||||
++args->source;
|
||||
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
|
||||
} else {
|
||||
/* double byte */
|
||||
const char *newLimit = args->source + 2;
|
||||
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit);
|
||||
args->source = newLimit; /* set the correct limit even in case of an error */
|
||||
}
|
||||
}
|
||||
else { /* single byte conversion */
|
||||
CHECK_SOURCE_LIMIT(1);
|
||||
|
@ -1079,13 +1082,17 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
|||
/* The non-optimizable oddballs where there is an explicit byte
|
||||
* AND the second byte is not in the upper ascii range
|
||||
*/
|
||||
const char *s;
|
||||
char bytes[2];
|
||||
|
||||
extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
|
||||
cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];
|
||||
|
||||
/* Lookup value must include opt group */
|
||||
mbChar = (UChar)(group << 8) | (UChar) CurByte;
|
||||
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
|
||||
uniChar = (UChar) ucmp16_getu(MyCArray, mbChar);
|
||||
bytes[0] = group;
|
||||
bytes[1] = CurByte;
|
||||
s = bytes;
|
||||
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1096,22 +1103,24 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
|||
cnv = extraInfo->OptGrpConverter[group];
|
||||
if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
|
||||
{
|
||||
ulmbcs_byte_t HighCh, LowCh;
|
||||
|
||||
if (cnv->sharedData->table->mbcs.starters[CurByte] == FALSE)
|
||||
if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
|
||||
{
|
||||
CHECK_SOURCE_LIMIT(0);
|
||||
mbChar = CurByte;
|
||||
|
||||
/* let the MBCS conversion consume CurByte again */
|
||||
--args->source;
|
||||
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK_SOURCE_LIMIT(1);
|
||||
HighCh = CurByte;
|
||||
LowCh = *(args->source)++;
|
||||
mbChar = (HighCh<<8) | LowCh;
|
||||
|
||||
/* let the MBCS conversion consume CurByte again */
|
||||
--args->source;
|
||||
|
||||
/* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
|
||||
uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2);
|
||||
}
|
||||
MyCArray = &cnv->sharedData->table->mbcs.toUnicode;
|
||||
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
|
||||
}
|
||||
else /* single byte conversion */
|
||||
{
|
||||
|
@ -1119,14 +1128,15 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
|
|||
}
|
||||
}
|
||||
}
|
||||
if (uniChar == missingUCharMarker)
|
||||
if ((uniChar - 0xfffd) <= 2) /* 0xfffd<=uniChar<=0xffff, was: uniChar == missingUCharMarker */
|
||||
{
|
||||
/*It's is very likely that the ErrorFunctor will write to the
|
||||
/*It is very likely that the ErrorFunctor will write to the
|
||||
*internal buffers */
|
||||
|
||||
/* This code needs updating when new error callbacks are installed */
|
||||
|
||||
UChar * pUniChar = (UChar *)&uniChar;
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
args->target = pUniChar;
|
||||
args->targetLimit = pUniChar + 1;
|
||||
args->source = saveSource;
|
||||
|
|
File diff suppressed because it is too large
Load diff
118
icu4c/source/common/ucnvmbcs.h
Normal file
118
icu4c/source/common/ucnvmbcs.h
Normal file
|
@ -0,0 +1,118 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: ucnvmbcs.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2000jul07
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UCNVMBCS_H__
|
||||
#define __UCNVMBCS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/* MBCS converter data and state -------------------------------------------- */
|
||||
|
||||
/*
 * Codes for the MBCS toUnicode state machine.
 *
 * NOTE(review): what each code means is defined by the implementation
 * (ucnvmbcs.c), which this header does not show; the group notes below are
 * read off the member names only and must be confirmed there.
 *
 * The numeric values are written out explicitly so that inserting or
 * reordering a member cannot silently renumber the others.
 */
enum {
    MBCS_STATE_ILLEGAL            = 0,
    MBCS_STATE_CHANGE_ONLY        = 1,
    MBCS_STATE_UNASSIGNED         = 2,

    /* presumably: fallback mappings whose result is stored directly */
    MBCS_STATE_FALLBACK_DIRECT_16 = 3,
    MBCS_STATE_FALLBACK_DIRECT_20 = 4,

    /* presumably: regular mappings whose result is stored directly */
    MBCS_STATE_VALID_DIRECT_16    = 5,
    MBCS_STATE_VALID_DIRECT_20    = 6,

    /* presumably: regular mappings looked up as 16-bit code unit(s) */
    MBCS_STATE_VALID_16           = 7,
    MBCS_STATE_VALID_16_PAIR      = 8
};
|
||||
|
||||
/*
 * Output-type codes; presumably the values held in
 * UConverterMBCSTable.outputType, which the .cnv MBCS header stores in
 * bits 7..0 of its flags word.
 *
 * Because such a value lives in a data file, the numbering is part of the
 * file format; every value is therefore written out explicitly.
 * The plain codes are 0..3 and the EUC variants start at 8.
 *
 * NOTE(review): the names suggest "N bytes per character" and EUC-specific
 * 3/4-byte handling; confirm against the implementation.
 */
enum {
    MBCS_OUTPUT_1     = 0,
    MBCS_OUTPUT_2     = 1,
    MBCS_OUTPUT_3     = 2,
    MBCS_OUTPUT_4     = 3,

    MBCS_OUTPUT_3_EUC = 8,
    MBCS_OUTPUT_4_EUC = 9
};
|
||||
|
||||
typedef struct {
|
||||
uint32_t offset;
|
||||
UChar32 codePoint;
|
||||
} _MBCSToUFallback;
|
||||
|
||||
typedef struct UConverterMBCSTable {
|
||||
/* toUnicode */
|
||||
uint8_t countStates;
|
||||
uint32_t countToUFallbacks;
|
||||
|
||||
const int32_t (*stateTable)/*[countStates]*/[256];
|
||||
const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/;
|
||||
const _MBCSToUFallback *toUFallbacks;
|
||||
|
||||
/* fromUnicode */
|
||||
const uint16_t *fromUnicodeTable;
|
||||
const uint8_t *fromUnicodeBytes;
|
||||
uint8_t outputType;
|
||||
} UConverterMBCSTable;
|
||||
|
||||
/*
|
||||
* MBCS data structure as part of a .cnv file:
|
||||
*
|
||||
* uint32_t [8]; -- 8 values:
|
||||
* 0 MBCS version in UVersionInfo format (1.0.0.0)
|
||||
* 1 countStates
|
||||
* 2 countToUFallbacks
|
||||
* 3 offsetToUCodeUnits (offsets are counted from the beginning of this header structure)
|
||||
* 4 offsetFromUTable
|
||||
* 5 offsetFromUBytes
|
||||
* 6 flags, bits:
|
||||
* 31.. 8 reserved
|
||||
* 7.. 0 outputType
|
||||
* 7 reserved
|
||||
*
|
||||
* stateTable[countStates][256];
|
||||
*
|
||||
* struct { (fallbacks are sorted by offset)
|
||||
* uint32_t offset;
|
||||
* UChar32 codePoint;
|
||||
* } toUFallbacks[countToUFallbacks];
|
||||
*
|
||||
* uint16_t unicodeCodeUnits[?]; (even number of units or padded)
|
||||
*
|
||||
* uint16_t fromUTable[0x440+?]; (32-bit-aligned)
|
||||
*
|
||||
* uint8_t fromUBytes[?];
|
||||
*/
|
||||
typedef struct {
|
||||
UVersionInfo version;
|
||||
uint32_t countStates,
|
||||
countToUFallbacks,
|
||||
offsetToUCodeUnits,
|
||||
offsetFromUTable,
|
||||
offsetFromUBytes,
|
||||
flags,
|
||||
reserved;
|
||||
} _MBCSHeader;
|
||||
|
||||
/*
 * Forward declaration only: this header passes UConverterSharedData by
 * pointer and does not need the full definition, which lives elsewhere.
 */
struct UConverterSharedData;
typedef struct UConverterSharedData UConverterSharedData;
|
||||
|
||||
U_CFUNC UChar32
|
||||
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
|
||||
const char **pSource, const char *sourceLimit);
|
||||
|
||||
U_CFUNC UBool
|
||||
_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue