mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-09 06:40:13 +00:00
Finish UTF16 conversion support
This commit is contained in:
parent
5b77f426ee
commit
a2625a0572
3 changed files with 131 additions and 62 deletions
|
@ -25,19 +25,23 @@ Contributor(s):
|
|||
#include "xmldef.h"
|
||||
|
||||
/* Compile-time choice of the parser's internal character flavour.
   With XML_UNICODE defined, the generic names used by the parser
   (XmlConvert, XmlEncode, XmlGetInternalEncoding, XML_ENCODE_MAX and the
   xmlstr* helpers) resolve to the UTF-16 / wide-character variants;
   otherwise they resolve to the UTF-8 / char variants. */
#ifdef XML_UNICODE
|
||||
#define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX
|
||||
#define XmlConvert XmlUtf16Convert
|
||||
#define XmlGetInternalEncoding XmlGetUtf16InternalEncoding
|
||||
#define XmlEncode XmlUtf16Encode
|
||||
#define xmlstrchr wcschr
|
||||
#define xmlstrcmp wcscmp
|
||||
#define XML_T(x) L ## x
|
||||
/* ICHAR is the element type of the output buffers handed to XmlConvert and
   XmlEncode.  NOTE(review): the string helpers above are the wchar_t
   functions and XML_T yields wide literals, while ICHAR is unsigned short --
   presumably XML_Char is a 16-bit wchar_t on the intended targets; confirm
   before building where wchar_t is wider. */
typedef unsigned short ICHAR;
|
||||
#else
|
||||
#define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX
|
||||
#define XmlConvert XmlUtf8Convert
|
||||
#define XmlGetInternalEncoding XmlGetUtf8InternalEncoding
|
||||
#define XmlEncode XmlUtf8Encode
|
||||
#define xmlstrchr strchr
|
||||
#define xmlstrcmp strcmp
|
||||
#define XML_T(x) x
|
||||
/* In the UTF-8 build the conversion output unit is a plain char. */
typedef char ICHAR;
|
||||
#endif
|
||||
|
||||
/* Round up n to be a multiple of sz, where sz is a power of 2. */
|
||||
|
@ -852,18 +856,19 @@ doContent(XML_Parser parser,
|
|||
++tagLevel;
|
||||
if (startElementHandler) {
|
||||
enum XML_Error result;
|
||||
char *toPtr;
|
||||
XML_Char *toPtr;
|
||||
for (;;) {
|
||||
const char *rawNameEnd = tag->rawName + tag->rawNameLength;
|
||||
const char *fromPtr = tag->rawName;
|
||||
int bufSize;
|
||||
toPtr = tag->buf;
|
||||
if (nextPtr)
|
||||
toPtr += ROUND_UP(tag->rawNameLength, sizeof(XML_Char));
|
||||
tag->name = (XML_Char *)toPtr;
|
||||
toPtr = (XML_Char *)(tag->buf + ROUND_UP(tag->rawNameLength, sizeof(XML_Char)));
|
||||
else
|
||||
toPtr = (XML_Char *)tag->buf;
|
||||
tag->name = toPtr;
|
||||
XmlConvert(enc,
|
||||
&fromPtr, rawNameEnd,
|
||||
&toPtr, tag->bufEnd - sizeof(XML_Char));
|
||||
(ICHAR **)&toPtr, (ICHAR *)tag->bufEnd - 1);
|
||||
if (fromPtr == rawNameEnd)
|
||||
break;
|
||||
bufSize = (tag->bufEnd - tag->buf) << 1;
|
||||
|
@ -874,7 +879,7 @@ doContent(XML_Parser parser,
|
|||
if (nextPtr)
|
||||
tag->rawName = tag->buf;
|
||||
}
|
||||
*(XML_Char *)toPtr = XML_T('\0');
|
||||
*toPtr = XML_T('\0');
|
||||
result = storeAtts(parser, enc, tag->name, s);
|
||||
if (result)
|
||||
return result;
|
||||
|
@ -958,8 +963,8 @@ doContent(XML_Parser parser,
|
|||
return XML_ERROR_BAD_CHAR_REF;
|
||||
}
|
||||
if (characterDataHandler) {
|
||||
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
|
||||
characterDataHandler(userData, buf, XmlEncode(n, (char *)buf)/sizeof(XML_Char));
|
||||
XML_Char buf[XML_ENCODE_MAX];
|
||||
characterDataHandler(userData, buf, XmlEncode(n, (ICHAR *)buf));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -987,9 +992,9 @@ doContent(XML_Parser parser,
|
|||
return XML_ERROR_NONE;
|
||||
}
|
||||
if (characterDataHandler) {
|
||||
char *dataPtr = (char *)dataBuf;
|
||||
XmlConvert(enc, &s, end, &dataPtr, (char *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
|
||||
ICHAR *dataPtr = (ICHAR *)dataBuf;
|
||||
XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
|
||||
}
|
||||
if (startTagLevel == 0) {
|
||||
errorPtr = end;
|
||||
|
@ -1003,9 +1008,9 @@ doContent(XML_Parser parser,
|
|||
case XML_TOK_DATA_CHARS:
|
||||
if (characterDataHandler) {
|
||||
do {
|
||||
char *dataPtr = (char *)dataBuf;
|
||||
XmlConvert(enc, &s, next, &dataPtr, (char *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
|
||||
ICHAR *dataPtr = (ICHAR *)dataBuf;
|
||||
XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
|
||||
} while (s != next);
|
||||
}
|
||||
break;
|
||||
|
@ -1155,9 +1160,9 @@ enum XML_Error doCdataSection(XML_Parser parser,
|
|||
case XML_TOK_DATA_CHARS:
|
||||
if (characterDataHandler) {
|
||||
do {
|
||||
char *dataPtr = (char *)dataBuf;
|
||||
XmlConvert(encoding, &s, next, &dataPtr, (char *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, (XML_Char *)dataPtr - dataBuf);
|
||||
ICHAR *dataPtr = (ICHAR *)dataBuf;
|
||||
XmlConvert(encoding, &s, next, &dataPtr, (ICHAR *)dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, dataPtr - (ICHAR *)dataBuf);
|
||||
} while (s != next);
|
||||
}
|
||||
break;
|
||||
|
@ -1521,7 +1526,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata,
|
|||
return XML_ERROR_INVALID_TOKEN;
|
||||
case XML_TOK_CHAR_REF:
|
||||
{
|
||||
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
|
||||
XML_Char buf[XML_ENCODE_MAX];
|
||||
int i;
|
||||
int n = XmlCharRefNumber(enc, ptr);
|
||||
if (n < 0) {
|
||||
|
@ -1532,7 +1537,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata,
|
|||
&& n == 0x20 /* space */
|
||||
&& (poolLength(pool) == 0 || poolLastChar(pool) == XML_T(' ')))
|
||||
break;
|
||||
n = XmlEncode(n, (char *)buf)/sizeof(XML_Char);
|
||||
n = XmlEncode(n, (ICHAR *)buf);
|
||||
if (!n) {
|
||||
errorPtr = ptr;
|
||||
return XML_ERROR_BAD_CHAR_REF;
|
||||
|
@ -1653,14 +1658,14 @@ enum XML_Error storeEntityValue(XML_Parser parser,
|
|||
break;
|
||||
case XML_TOK_CHAR_REF:
|
||||
{
|
||||
XML_Char buf[XML_MAX_BYTES_PER_CHAR];
|
||||
XML_Char buf[XML_ENCODE_MAX];
|
||||
int i;
|
||||
int n = XmlCharRefNumber(encoding, entityTextPtr);
|
||||
if (n < 0) {
|
||||
errorPtr = entityTextPtr;
|
||||
return XML_ERROR_BAD_CHAR_REF;
|
||||
}
|
||||
n = XmlEncode(n, (char *)buf)/sizeof(XML_Char);
|
||||
n = XmlEncode(n, (ICHAR *)buf);
|
||||
if (!n) {
|
||||
errorPtr = entityTextPtr;
|
||||
return XML_ERROR_BAD_CHAR_REF;
|
||||
|
@ -2068,9 +2073,7 @@ XML_Char *poolAppend(STRING_POOL *pool, const ENCODING *enc,
|
|||
if (!pool->ptr && !poolGrow(pool))
|
||||
return 0;
|
||||
for (;;) {
|
||||
/* The cast to (char **) won't work on machines with different
|
||||
representations for char * and wchar_t *. */
|
||||
XmlConvert(enc, &ptr, end, (char **)&(pool->ptr), (char *)pool->end);
|
||||
XmlConvert(enc, &ptr, end, (ICHAR **)&(pool->ptr), (ICHAR *)pool->end);
|
||||
if (ptr == end)
|
||||
break;
|
||||
if (!poolGrow(pool))
|
||||
|
|
|
@ -141,9 +141,40 @@ void utf8_toUtf8(const ENCODING *enc,
|
|||
static
|
||||
void utf8_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
/* FIXME */
|
||||
unsigned short *to = *toP;
|
||||
const char *from = *fromP;
|
||||
while (from != fromLim && to != toLim) {
|
||||
switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
|
||||
case BT_LEAD2:
|
||||
*to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
|
||||
from += 2;
|
||||
break;
|
||||
case BT_LEAD3:
|
||||
*to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
|
||||
from += 3;
|
||||
break;
|
||||
case BT_LEAD4:
|
||||
{
|
||||
unsigned long n;
|
||||
if (to + 1 == toLim)
|
||||
break;
|
||||
n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
|
||||
n -= 0x10000;
|
||||
to[0] = (unsigned short)((n >> 10) | 0xD800);
|
||||
to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
|
||||
to += 2;
|
||||
from += 4;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
*to++ = *from++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*fromP = from;
|
||||
*toP = to;
|
||||
}
|
||||
|
||||
static const struct normal_encoding utf8_encoding = {
|
||||
|
@ -190,8 +221,10 @@ void latin1_toUtf8(const ENCODING *enc,
|
|||
static
|
||||
void latin1_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
while (*fromP != fromLim && *toP != toLim)
|
||||
*(*toP)++ = (unsigned char)*(*fromP)++;
|
||||
}
|
||||
|
||||
static const struct normal_encoding latin1_encoding = {
|
||||
|
@ -202,8 +235,6 @@ static const struct normal_encoding latin1_encoding = {
|
|||
}
|
||||
};
|
||||
|
||||
#define latin1tab (latin1_encoding.type)
|
||||
|
||||
static
|
||||
void ascii_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
|
@ -213,16 +244,8 @@ void ascii_toUtf8(const ENCODING *enc,
|
|||
*(*toP)++ = *(*fromP)++;
|
||||
}
|
||||
|
||||
static
|
||||
void ascii_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
{
|
||||
/* FIXME */
|
||||
}
|
||||
|
||||
static const struct normal_encoding ascii_encoding = {
|
||||
{ VTABLE1, ascii_toUtf8, ascii_toUtf16, 1 },
|
||||
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1 },
|
||||
{
|
||||
#include "asciitab.h"
|
||||
/* BT_NONXML == 0 */
|
||||
|
@ -316,15 +339,22 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
static \
|
||||
void PREFIX(toUtf16)(const ENCODING *enc, \
|
||||
const char **fromP, const char *fromLim, \
|
||||
char **toP, const char *toLim) \
|
||||
unsigned short **toP, const unsigned short *toLim) \
|
||||
{ \
|
||||
/* FIXME */ \
|
||||
/* Avoid copying first half only of surrogate */ \
|
||||
if (fromLim - *fromP > ((toLim - *toP) << 1) \
|
||||
&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
|
||||
fromLim -= 2; \
|
||||
for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
|
||||
*(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
|
||||
}
|
||||
|
||||
#define PREFIX(ident) little2_ ## ident
|
||||
#define MINBPC 2
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
|
||||
((p)[1] == 0 \
|
||||
? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
|
||||
: unicode_byte_type((p)[1], (p)[0]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
|
@ -353,7 +383,21 @@ DEFINE_UTF16_TO_UTF16
|
|||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct encoding little2_encoding = { VTABLE, 2 };
|
||||
static const struct normal_encoding little2_encoding = {
|
||||
{ VTABLE, 2 },
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#if BYTE_ORDER != 21
|
||||
|
||||
static const struct normal_encoding internal_little2_encoding = {
|
||||
{ VTABLE, 2 },
|
||||
#include "iasciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
|
@ -361,7 +405,9 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
|
|||
#define MINBPC 2
|
||||
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
|
||||
((p)[0] == 0 \
|
||||
? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
|
||||
: unicode_byte_type((p)[0], (p)[1]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
|
@ -390,7 +436,21 @@ DEFINE_UTF16_TO_UTF16
|
|||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct encoding big2_encoding = { VTABLE, 2 };
|
||||
static const struct normal_encoding big2_encoding = {
|
||||
{ VTABLE, 2 },
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#if BYTE_ORDER != 12
|
||||
|
||||
static const struct normal_encoding internal_big2_encoding = {
|
||||
{ VTABLE, 2 },
|
||||
#include "iasciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
|
@ -433,18 +493,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
|
|||
else {
|
||||
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
|
||||
case 0x003C:
|
||||
*encPtr = &big2_encoding;
|
||||
*encPtr = &big2_encoding.enc;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFEFF:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &big2_encoding;
|
||||
*encPtr = &big2_encoding.enc;
|
||||
return XML_TOK_BOM;
|
||||
case 0x3C00:
|
||||
*encPtr = &little2_encoding;
|
||||
*encPtr = &little2_encoding.enc;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFFFE:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &little2_encoding;
|
||||
*encPtr = &little2_encoding.enc;
|
||||
return XML_TOK_BOM;
|
||||
}
|
||||
}
|
||||
|
@ -480,8 +540,14 @@ const ENCODING *XmlGetUtf8InternalEncoding()
|
|||
|
||||
const ENCODING *XmlGetUtf16InternalEncoding()
|
||||
{
|
||||
/* FIXME */
|
||||
return 0;
|
||||
#if BYTE_ORDER == 12
|
||||
return &internal_little2_encoding.enc;
|
||||
#elif BYTE_ORDER == 21
|
||||
return &internal_big2_encoding.enc;
|
||||
#else
|
||||
const short n = 1;
|
||||
return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
|
||||
#endif
|
||||
}
|
||||
|
||||
int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
|
||||
|
@ -644,7 +710,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
|
|||
static const unsigned short n = 1;
|
||||
if (enc->minBytesPerChar == 2)
|
||||
return enc;
|
||||
return &big2_encoding;
|
||||
return &big2_encoding.enc;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -780,21 +846,19 @@ int XmlUtf8Encode(int c, char *buf)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Encode one character number as UTF-16.

   charNum  the character number to encode.
   buf      output buffer with room for at least two 16-bit units
            (XML_UTF16_ENCODE_MAX).

   Returns the number of units written: 1 for values below 0x10000, 2 (a
   high/low surrogate pair) for values below 0x110000, and 0 -- with buf
   untouched -- for negative values or values of 0x110000 and above. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  if (charNum < 0x110000) {
    /* Split the 20-bit offset into two 10-bit halves. */
    charNum -= 0x10000;
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
    return 2;
  }
  return 0;
}
|
||||
|
|
|
@ -103,8 +103,10 @@ extern "C" {
|
|||
#define XML_ATTRIBUTE_VALUE_LITERAL 0
|
||||
#define XML_ENTITY_VALUE_LITERAL 1
|
||||
|
||||
/* The size of the buffer passed to XmlUtf8Encode and XmlUtf16Encode must be at least this. */
|
||||
#define XML_MAX_BYTES_PER_CHAR 4
|
||||
/* The size of the buffer passed to XmlUtf8Encode must be at least this. */
|
||||
#define XML_UTF8_ENCODE_MAX 4
|
||||
/* The size of the buffer passed to XmlUtf16Encode must be at least this. */
|
||||
#define XML_UTF16_ENCODE_MAX 2
|
||||
|
||||
typedef struct position {
|
||||
/* first line and first column are 0 not 1 */
|
||||
|
@ -154,8 +156,8 @@ struct encoding {
|
|||
void (*utf16Convert)(const ENCODING *enc,
|
||||
const char **fromP,
|
||||
const char *fromLim,
|
||||
char **toP,
|
||||
const char *toLim);
|
||||
unsigned short **toP,
|
||||
const unsigned short *toLim);
|
||||
int minBytesPerChar;
|
||||
};
|
||||
|
||||
|
@ -252,7 +254,7 @@ int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *na
|
|||
const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding();
|
||||
const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
|
||||
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
|
||||
int XMLTOKAPI XmlUtf16Encode(int charNumber, char *buf);
|
||||
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue