Finish UTF16 conversion support

This commit is contained in:
James Clark 1998-05-31 08:06:01 +00:00
parent 5b77f426ee
commit a2625a0572
3 changed files with 131 additions and 62 deletions

View file

@ -25,19 +25,23 @@ Contributor(s):
#include "xmldef.h"
/* Bind the generic Xml* helper names used by the parser to either the
   UTF-16 or the UTF-8 flavour of the tokenizer routines, depending on
   whether XML_UNICODE is defined. */
#ifdef XML_UNICODE
/* Number of ICHAR units XmlEncode may write for one character. */
#define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX
#define XmlConvert XmlUtf16Convert
#define XmlGetInternalEncoding XmlGetUtf16InternalEncoding
#define XmlEncode XmlUtf16Encode
/* NOTE(review): wcschr/wcscmp and L"" literals operate on wchar_t, while
   ICHAR below is unsigned short; this presumably relies on XML_Char being
   a 16-bit wchar_t in XML_UNICODE builds -- confirm against xmldef.h. */
#define xmlstrchr wcschr
#define xmlstrcmp wcscmp
#define XML_T(x) L ## x
/* Unit type the conversion and encode routines write; the parser casts its
   XML_Char buffers to ICHAR when calling XmlConvert and XmlEncode. */
typedef unsigned short ICHAR;
#else
/* Number of ICHAR units XmlEncode may write for one character. */
#define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX
#define XmlConvert XmlUtf8Convert
#define XmlGetInternalEncoding XmlGetUtf8InternalEncoding
#define XmlEncode XmlUtf8Encode
#define xmlstrchr strchr
#define xmlstrcmp strcmp
#define XML_T(x) x
/* Unit type the conversion and encode routines write; the parser casts its
   XML_Char buffers to ICHAR when calling XmlConvert and XmlEncode. */
typedef char ICHAR;
#endif
/* Round up n to be a multiple of sz, where sz is a power of 2. */
@ -852,18 +856,19 @@ doContent(XML_Parser parser,
++tagLevel;
if (startElementHandler) {
enum XML_Error result;
char *toPtr;
XML_Char *toPtr;
for (;;) {
const char *rawNameEnd = tag->rawName + tag->rawNameLength;
const char *fromPtr = tag->rawName;
int bufSize;
toPtr = tag->buf;
if (nextPtr)
toPtr += ROUND_UP(tag->rawNameLength, sizeof(XML_Char));
tag->name = (XML_Char *)toPtr;
toPtr = (XML_Char *)(tag->buf + ROUND_UP(tag->rawNameLength, sizeof(XML_Char)));
else
toPtr = (XML_Char *)tag->buf;
tag->name = toPtr;
XmlConvert(enc,
&fromPtr, rawNameEnd,
&toPtr, tag->bufEnd - sizeof(XML_Char));
(ICHAR **)&toPtr, (ICHAR *)tag->bufEnd - 1);
if (fromPtr == rawNameEnd)
break;
bufSize = (tag->bufEnd - tag->buf) << 1;
@ -874,7 +879,7 @@ doContent(XML_Parser parser,
if (nextPtr)
tag->rawName = tag->buf;
}
*(XML_Char *)toPtr = XML_T('\0');
*toPtr = XML_T('\0');
result = storeAtts(parser, enc, tag->name, s);
if (result)
return result;
@ -958,8 +963,8 @@ doContent(XML_Parser parser,
return XML_ERROR_BAD_CHAR_REF;
}
if (characterDataHandler) {
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
characterDataHandler(userData, buf, XmlEncode(n, (char *)buf)/sizeof(XML_Char));
XML_Char buf[XML_ENCODE_MAX];
characterDataHandler(userData, buf, XmlEncode(n, (ICHAR *)buf));
}
}
break;
@ -987,9 +992,9 @@ doContent(XML_Parser parser,
return XML_ERROR_NONE;
}
if (characterDataHandler) {
char *dataPtr = (char *)dataBuf;
XmlConvert(enc, &s, end, &dataPtr, (char *)dataBufEnd);
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
ICHAR *dataPtr = (ICHAR *)dataBuf;
XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd);
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
}
if (startTagLevel == 0) {
errorPtr = end;
@ -1003,9 +1008,9 @@ doContent(XML_Parser parser,
case XML_TOK_DATA_CHARS:
if (characterDataHandler) {
do {
char *dataPtr = (char *)dataBuf;
XmlConvert(enc, &s, next, &dataPtr, (char *)dataBufEnd);
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
ICHAR *dataPtr = (ICHAR *)dataBuf;
XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd);
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
} while (s != next);
}
break;
@ -1155,9 +1160,9 @@ enum XML_Error doCdataSection(XML_Parser parser,
case XML_TOK_DATA_CHARS:
if (characterDataHandler) {
do {
char *dataPtr = (char *)dataBuf;
XmlConvert(encoding, &s, next, &dataPtr, (char *)dataBufEnd);
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
ICHAR *dataPtr = (ICHAR *)dataBuf;
XmlConvert(encoding, &s, next, &dataPtr, (ICHAR *)dataBufEnd);
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
} while (s != next);
}
break;
@ -1521,7 +1526,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata,
return XML_ERROR_INVALID_TOKEN;
case XML_TOK_CHAR_REF:
{
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
XML_Char buf[XML_ENCODE_MAX];
int i;
int n = XmlCharRefNumber(enc, ptr);
if (n < 0) {
@ -1532,7 +1537,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata,
&& n == 0x20 /* space */
&& (poolLength(pool) == 0 || poolLastChar(pool) == XML_T(' ')))
break;
n = XmlEncode(n, (char *)buf)/sizeof(XML_Char);
n = XmlEncode(n, (ICHAR *)buf);
if (!n) {
errorPtr = ptr;
return XML_ERROR_BAD_CHAR_REF;
@ -1653,14 +1658,14 @@ enum XML_Error storeEntityValue(XML_Parser parser,
break;
case XML_TOK_CHAR_REF:
{
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
XML_Char buf[XML_ENCODE_MAX];
int i;
int n = XmlCharRefNumber(encoding, entityTextPtr);
if (n < 0) {
errorPtr = entityTextPtr;
return XML_ERROR_BAD_CHAR_REF;
}
n = XmlEncode(n, (char *)buf)/sizeof(XML_Char);
n = XmlEncode(n, (ICHAR *)buf);
if (!n) {
errorPtr = entityTextPtr;
return XML_ERROR_BAD_CHAR_REF;
@ -2068,9 +2073,7 @@ XML_Char *poolAppend(STRING_POOL *pool, const ENCODING *enc,
if (!pool->ptr && !poolGrow(pool))
return 0;
for (;;) {
/* The cast to (char **) won't work on machines with different
representations for char * and wchar_t *. */
XmlConvert(enc, &ptr, end, (char **)&(pool->ptr), (char *)pool->end);
XmlConvert(enc, &ptr, end, (ICHAR **)&(pool->ptr), (ICHAR *)pool->end);
if (ptr == end)
break;
if (!poolGrow(pool))

View file

@ -141,9 +141,40 @@ void utf8_toUtf8(const ENCODING *enc,
static
void utf8_toUtf16(const ENCODING *enc,
const char **fromP, const char *fromLim,
char **toP, const char *toLim)
unsigned short **toP, const unsigned short *toLim)
{
/* FIXME */
unsigned short *to = *toP;
const char *from = *fromP;
while (from != fromLim && to != toLim) {
switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
case BT_LEAD2:
*to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
from += 2;
break;
case BT_LEAD3:
*to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
from += 3;
break;
case BT_LEAD4:
{
unsigned long n;
if (to + 1 == toLim)
break;
n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
n -= 0x10000;
to[0] = (unsigned short)((n >> 10) | 0xD800);
to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
to += 2;
from += 4;
}
break;
default:
*to++ = *from++;
break;
}
}
*fromP = from;
*toP = to;
}
static const struct normal_encoding utf8_encoding = {
@ -190,8 +221,10 @@ void latin1_toUtf8(const ENCODING *enc,
/* Convert Latin-1 bytes to UTF-16 code units, one unit per input byte.

   fromP/fromLim delimit the input and toP/toLim the output; both *fromP
   and *toP are advanced as bytes are converted.  Conversion stops when
   either the input is exhausted or the output is full.  Each byte is
   widened through unsigned char so values 0x80-0xFF map to U+0080-U+00FF
   rather than being sign-extended.  enc is not used.  The ASCII encoding
   table also installs this function as its UTF-16 converter. */
static
void latin1_toUtf16(const ENCODING *enc,
                    const char **fromP, const char *fromLim,
                    unsigned short **toP, const unsigned short *toLim)
{
  while (*fromP != fromLim && *toP != toLim)
    *(*toP)++ = (unsigned char)*(*fromP)++;
}
static const struct normal_encoding latin1_encoding = {
@ -202,8 +235,6 @@ static const struct normal_encoding latin1_encoding = {
}
};
#define latin1tab (latin1_encoding.type)
static
void ascii_toUtf8(const ENCODING *enc,
const char **fromP, const char *fromLim,
@ -213,16 +244,8 @@ void ascii_toUtf8(const ENCODING *enc,
*(*toP)++ = *(*fromP)++;
}
static
void ascii_toUtf16(const ENCODING *enc,
const char **fromP, const char *fromLim,
char **toP, const char *toLim)
{
/* FIXME */
}
static const struct normal_encoding ascii_encoding = {
{ VTABLE1, ascii_toUtf8, ascii_toUtf16, 1 },
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1 },
{
#include "asciitab.h"
/* BT_NONXML == 0 */
@ -316,15 +339,22 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
static \
void PREFIX(toUtf16)(const ENCODING *enc, \
const char **fromP, const char *fromLim, \
char **toP, const char *toLim) \
unsigned short **toP, const unsigned short *toLim) \
{ \
/* FIXME */ \
/* Avoid copying first half only of surrogate */ \
if (fromLim - *fromP > ((toLim - *toP) << 1) \
&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
fromLim -= 2; \
for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
*(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
#define PREFIX(ident) little2_ ## ident
#define MINBPC 2
#define BYTE_TYPE(enc, p) \
((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
((p)[1] == 0 \
? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
: unicode_byte_type((p)[1], (p)[0]))
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define IS_NAME_CHAR(enc, p, n) \
@ -353,7 +383,21 @@ DEFINE_UTF16_TO_UTF16
#undef IS_NMSTRT_CHAR
#undef IS_INVALID_CHAR
static const struct encoding little2_encoding = { VTABLE, 2 };
/* Encoding object for 2-byte little-endian input; initScan selects it when
   the document starts with bytes 3C 00 or the FF FE byte order mark.  It is
   a normal_encoding so that the little2 BYTE_TYPE macro can classify
   characters whose high byte is zero through the encoding's own type
   table, which is filled from asciitab.h followed by latin1tab.h. */
static const struct normal_encoding little2_encoding = {
{ VTABLE, 2 },
#include "asciitab.h"
#include "latin1tab.h"
};
/* Little-endian 2-byte encoding returned by XmlGetUtf16InternalEncoding
   when BYTE_ORDER is 12, or when BYTE_ORDER is neither 12 nor 21 and the
   run-time probe finds a non-zero first byte.  Compiled out when
   BYTE_ORDER is 21, because that build always returns the big-endian
   variant.  Same as little2_encoding except that the type table starts
   from iasciitab.h in place of asciitab.h.
   NOTE(review): iasciitab.h is not visible here; how it differs from
   asciitab.h should be confirmed against that header. */
#if BYTE_ORDER != 21
static const struct normal_encoding internal_little2_encoding = {
{ VTABLE, 2 },
#include "iasciitab.h"
#include "latin1tab.h"
};
#endif
#undef PREFIX
@ -361,7 +405,9 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
#define MINBPC 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) \
((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
((p)[0] == 0 \
? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
: unicode_byte_type((p)[0], (p)[1]))
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define IS_NAME_CHAR(enc, p, n) \
@ -390,7 +436,21 @@ DEFINE_UTF16_TO_UTF16
#undef IS_NMSTRT_CHAR
#undef IS_INVALID_CHAR
static const struct encoding big2_encoding = { VTABLE, 2 };
/* Encoding object for 2-byte big-endian input; initScan selects it when
   the document starts with bytes 00 3C or the FE FF byte order mark, and
   findEncoding falls back to it.  It is a normal_encoding so that the big2
   BYTE_TYPE macro can classify characters whose high byte is zero through
   the encoding's own type table, which is filled from asciitab.h followed
   by latin1tab.h. */
static const struct normal_encoding big2_encoding = {
{ VTABLE, 2 },
#include "asciitab.h"
#include "latin1tab.h"
};
/* Big-endian 2-byte encoding returned by XmlGetUtf16InternalEncoding when
   BYTE_ORDER is 21, or when BYTE_ORDER is neither 12 nor 21 and the
   run-time probe finds a zero first byte.  Compiled out when BYTE_ORDER is
   12, because that build always returns the little-endian variant.  Same
   as big2_encoding except that the type table starts from iasciitab.h in
   place of asciitab.h.
   NOTE(review): iasciitab.h is not visible here; how it differs from
   asciitab.h should be confirmed against that header. */
#if BYTE_ORDER != 12
static const struct normal_encoding internal_big2_encoding = {
{ VTABLE, 2 },
#include "iasciitab.h"
#include "latin1tab.h"
};
#endif
#undef PREFIX
@ -433,18 +493,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
else {
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
case 0x003C:
*encPtr = &big2_encoding;
*encPtr = &big2_encoding.enc;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFEFF:
*nextTokPtr = ptr + 2;
*encPtr = &big2_encoding;
*encPtr = &big2_encoding.enc;
return XML_TOK_BOM;
case 0x3C00:
*encPtr = &little2_encoding;
*encPtr = &little2_encoding.enc;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFFFE:
*nextTokPtr = ptr + 2;
*encPtr = &little2_encoding;
*encPtr = &little2_encoding.enc;
return XML_TOK_BOM;
}
}
@ -480,8 +540,14 @@ const ENCODING *XmlGetUtf8InternalEncoding()
/* Return the encoding object for UTF-16 data held in this machine's own
   byte order.

   BYTE_ORDER == 12 selects the little-endian table and BYTE_ORDER == 21
   the big-endian one at compile time.  For any other value the byte order
   is probed at run time.  The result points at a static object and is
   never null. */
const ENCODING *XmlGetUtf16InternalEncoding()
{
#if BYTE_ORDER == 12
  return &internal_little2_encoding.enc;
#elif BYTE_ORDER == 21
  return &internal_big2_encoding.enc;
#else
  /* Store 1 in a short and look at its first byte: non-zero means the
     low-order byte comes first, i.e. little-endian. */
  const short n = 1;
  return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
#endif
}
int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
@ -644,7 +710,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
static const unsigned short n = 1;
if (enc->minBytesPerChar == 2)
return enc;
return &big2_encoding;
return &big2_encoding.enc;
}
return 0;
}
@ -780,21 +846,19 @@ int XmlUtf8Encode(int c, char *buf)
return 0;
}
/* Encode the character number charNum as UTF-16 into buf.

   buf must have room for XML_UTF16_ENCODE_MAX (2) unsigned short units.
   Returns the number of units written: 1 for values below 0x10000, 2 (a
   high/low surrogate pair) for 0x10000 through 0x10FFFF, and 0 with buf
   untouched when charNum is negative or above 0x10FFFF.

   Values in the surrogate range 0xD800-0xDFFF are stored unchanged as a
   single unit; this function does not reject them. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  if (charNum < 0x110000) {
    /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
    charNum -= 0x10000;
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
    return 2;
  }
  return 0;
}

View file

@ -103,8 +103,10 @@ extern "C" {
#define XML_ATTRIBUTE_VALUE_LITERAL 0
#define XML_ENTITY_VALUE_LITERAL 1
/* The size of the buffer passed to XmlUtf8Encode and XmlUtf16Encode must be at least this. */
#define XML_MAX_BYTES_PER_CHAR 4
/* The buffer passed to XmlUtf8Encode must hold at least this many char
   elements. */
#define XML_UTF8_ENCODE_MAX 4
/* The buffer passed to XmlUtf16Encode must hold at least this many
   unsigned short elements (enough for one surrogate pair). */
#define XML_UTF16_ENCODE_MAX 2
typedef struct position {
/* first line and first column are 0 not 1 */
@ -154,8 +156,8 @@ struct encoding {
void (*utf16Convert)(const ENCODING *enc,
const char **fromP,
const char *fromLim,
char **toP,
const char *toLim);
unsigned short **toP,
const unsigned short *toLim);
int minBytesPerChar;
};
@ -252,7 +254,7 @@ int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *na
const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding();
const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
int XMLTOKAPI XmlUtf16Encode(int charNumber, char *buf);
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
#ifdef __cplusplus
}