mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-06 13:45:00 +00:00
Generalize unknown encoding support
This commit is contained in:
parent
e8ac3d8f5f
commit
84835ac373
8 changed files with 330 additions and 81 deletions
|
@ -215,11 +215,14 @@ typedef struct {
|
|||
XML_UnparsedEntityDeclHandler unparsedEntityDeclHandler;
|
||||
XML_NotationDeclHandler notationDeclHandler;
|
||||
XML_ExternalEntityRefHandler externalEntityRefHandler;
|
||||
XML_SingleByteEncodingHandler singleByteEncodingHandler;
|
||||
XML_UnknownEncodingHandler unknownEncodingHandler;
|
||||
const ENCODING *encoding;
|
||||
INIT_ENCODING initEncoding;
|
||||
const XML_Char *protocolEncodingName;
|
||||
void *singleByteEncodingMem;
|
||||
void *unknownEncodingMem;
|
||||
void *unknownEncodingData;
|
||||
void *unknownEncodingHandlerData;
|
||||
void (*unknownEncodingRelease)(void *);
|
||||
PROLOG_STATE prologState;
|
||||
Processor *processor;
|
||||
enum XML_Error errorCode;
|
||||
|
@ -253,10 +256,14 @@ typedef struct {
|
|||
#define unparsedEntityDeclHandler (((Parser *)parser)->unparsedEntityDeclHandler)
|
||||
#define notationDeclHandler (((Parser *)parser)->notationDeclHandler)
|
||||
#define externalEntityRefHandler (((Parser *)parser)->externalEntityRefHandler)
|
||||
#define singleByteEncodingHandler (((Parser *)parser)->singleByteEncodingHandler)
|
||||
#define unknownEncodingHandler (((Parser *)parser)->unknownEncodingHandler)
|
||||
#define encoding (((Parser *)parser)->encoding)
|
||||
#define initEncoding (((Parser *)parser)->initEncoding)
|
||||
#define singleByteEncodingMem (((Parser *)parser)->singleByteEncodingMem)
|
||||
#define unknownEncodingMem (((Parser *)parser)->unknownEncodingMem)
|
||||
#define unknownEncodingData (((Parser *)parser)->unknownEncodingData)
|
||||
#define unknownEncodingHandlerData \
|
||||
(((Parser *)parser)->unknownEncodingHandlerData)
|
||||
#define unknownEncodingRelease (((Parser *)parser)->unknownEncodingRelease)
|
||||
#define protocolEncodingName (((Parser *)parser)->protocolEncodingName)
|
||||
#define prologState (((Parser *)parser)->prologState)
|
||||
#define processor (((Parser *)parser)->processor)
|
||||
|
@ -304,7 +311,7 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName)
|
|||
unparsedEntityDeclHandler = 0;
|
||||
notationDeclHandler = 0;
|
||||
externalEntityRefHandler = 0;
|
||||
singleByteEncodingHandler = 0;
|
||||
unknownEncodingHandler = 0;
|
||||
buffer = 0;
|
||||
bufferPtr = 0;
|
||||
bufferEnd = 0;
|
||||
|
@ -328,7 +335,10 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName)
|
|||
groupSize = 0;
|
||||
groupConnector = 0;
|
||||
hadExternalDoctype = 0;
|
||||
singleByteEncodingMem = 0;
|
||||
unknownEncodingMem = 0;
|
||||
unknownEncodingRelease = 0;
|
||||
unknownEncodingData = 0;
|
||||
unknownEncodingHandlerData = 0;
|
||||
poolInit(&tempPool);
|
||||
poolInit(&temp2Pool);
|
||||
protocolEncodingName = encodingName ? poolCopyString(&tempPool, encodingName) : 0;
|
||||
|
@ -353,7 +363,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser,
|
|||
XML_CharacterDataHandler oldCharacterDataHandler = characterDataHandler;
|
||||
XML_ProcessingInstructionHandler oldProcessingInstructionHandler = processingInstructionHandler;
|
||||
XML_ExternalEntityRefHandler oldExternalEntityRefHandler = externalEntityRefHandler;
|
||||
XML_SingleByteEncodingHandler oldSingleByteEncodingHandler = singleByteEncodingHandler;
|
||||
XML_UnknownEncodingHandler oldUnknownEncodingHandler = unknownEncodingHandler;
|
||||
void *oldUserData = userData;
|
||||
|
||||
parser = XML_ParserCreate(encodingName);
|
||||
|
@ -364,7 +374,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser,
|
|||
characterDataHandler = oldCharacterDataHandler;
|
||||
processingInstructionHandler = oldProcessingInstructionHandler;
|
||||
externalEntityRefHandler = oldExternalEntityRefHandler;
|
||||
singleByteEncodingHandler = oldSingleByteEncodingHandler;
|
||||
unknownEncodingHandler = oldUnknownEncodingHandler;
|
||||
userData = oldUserData;
|
||||
if (!dtdCopy(&dtd, oldDtd) || !setOpenEntityNames(parser, openEntityNames)) {
|
||||
XML_ParserFree(parser);
|
||||
|
@ -396,7 +406,9 @@ void XML_ParserFree(XML_Parser parser)
|
|||
free(groupConnector);
|
||||
free(buffer);
|
||||
free(dataBuf);
|
||||
free(singleByteEncodingMem);
|
||||
free(unknownEncodingMem);
|
||||
if (unknownEncodingRelease)
|
||||
unknownEncodingRelease(unknownEncodingData);
|
||||
free(parser);
|
||||
}
|
||||
|
||||
|
@ -461,10 +473,12 @@ void XML_SetExternalEntityRefHandler(XML_Parser parser,
|
|||
externalEntityRefHandler = handler;
|
||||
}
|
||||
|
||||
void XML_SetSingleByteEncodingHandler(XML_Parser parser,
|
||||
XML_SingleByteEncodingHandler handler)
|
||||
void XML_SetUnknownEncodingHandler(XML_Parser parser,
|
||||
XML_UnknownEncodingHandler handler,
|
||||
void *data)
|
||||
{
|
||||
singleByteEncodingHandler = handler;
|
||||
unknownEncodingHandler = handler;
|
||||
unknownEncodingHandlerData = data;
|
||||
}
|
||||
|
||||
int XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
|
||||
|
@ -1331,22 +1345,35 @@ processXmlDecl(XML_Parser parser, int isGeneralTextEntity,
|
|||
static enum XML_Error
|
||||
handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName)
|
||||
{
|
||||
if (singleByteEncodingHandler) {
|
||||
unsigned short table[256];
|
||||
if (unknownEncodingHandler) {
|
||||
XML_Encoding info;
|
||||
int i;
|
||||
for (i = 0; i < 256; i++)
|
||||
table[i] = 0;
|
||||
if (singleByteEncodingHandler(userData, encodingName, table)) {
|
||||
info.map[i] = 0;
|
||||
info.convert = 0;
|
||||
info.data = 0;
|
||||
info.release = 0;
|
||||
if (unknownEncodingHandler(unknownEncodingHandlerData, encodingName, &info)) {
|
||||
ENCODING *enc;
|
||||
singleByteEncodingMem = malloc(XmlSizeOfSingleByteEncoding());
|
||||
if (!singleByteEncodingMem)
|
||||
unknownEncodingMem = malloc(XmlSizeOfUnknownEncoding());
|
||||
if (!unknownEncodingMem) {
|
||||
if (info.release)
|
||||
info.release(info.data);
|
||||
return XML_ERROR_NO_MEMORY;
|
||||
enc = XmlInitSingleByteEncoding(singleByteEncodingMem, table);
|
||||
}
|
||||
enc = XmlInitUnknownEncoding(unknownEncodingMem,
|
||||
info.map,
|
||||
info.convert,
|
||||
info.data);
|
||||
if (enc) {
|
||||
unknownEncodingData = info.data;
|
||||
unknownEncodingRelease = info.release;
|
||||
encoding = enc;
|
||||
return XML_ERROR_NONE;
|
||||
}
|
||||
}
|
||||
if (info.release)
|
||||
info.release(info.data);
|
||||
}
|
||||
return XML_ERROR_UNKNOWN_ENCODING;
|
||||
}
|
||||
|
|
|
@ -110,10 +110,19 @@ typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser,
|
|||
const XML_Char *systemId,
|
||||
const XML_Char *publicId);
|
||||
|
||||
/* Description of an encoding, filled in by an XML_UnknownEncodingHandler.
   The parser zeroes every member before invoking the handler.
     map     - one entry per possible byte value; passed on to
               XmlInitUnknownEncoding as its table argument.
     data    - opaque pointer handed back to convert and release.
     convert - called as convert(data, s) with s pointing into the input;
               returns an unsigned short for the sequence at s.  May be left
               0 (XmlInitUnknownEncoding checks for that).
     release - if non-zero, called as release(data) when the parser no longer
               needs the encoding (on setup failure or in XML_ParserFree). */
typedef struct {
|
||||
unsigned short map[256];
|
||||
void *data;
|
||||
unsigned short (*convert)(void *data, const char *s);
|
||||
void (*release)(void *data);
|
||||
} XML_Encoding;
|
||||
|
||||
typedef int (*XML_SingleByteEncodingHandler)(void *userData,
|
||||
const XML_Char *encoding,
|
||||
unsigned short *table);
|
||||
/* The encodingHandlerData passed to this call is that which was passed as the
|
||||
second argument to XML_SetUnknownEncodingHandler. */
|
||||
|
||||
typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData,
|
||||
const XML_Char *name,
|
||||
XML_Encoding *info);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetElementHandler(XML_Parser parser,
|
||||
|
@ -141,8 +150,9 @@ XML_SetExternalEntityRefHandler(XML_Parser parser,
|
|||
XML_ExternalEntityRefHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetSingleByteEncodingHandler(XML_Parser parser,
|
||||
XML_SingleByteEncodingHandler handler);
|
||||
XML_SetUnknownEncodingHandler(XML_Parser parser,
|
||||
XML_UnknownEncodingHandler handler,
|
||||
void *encodingHandlerData);
|
||||
|
||||
/* This value is passed as the userData argument to callbacks. */
|
||||
void XMLPARSEAPI
|
||||
|
|
|
@ -78,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and
|
|||
|
||||
#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
|
||||
|
||||
static
|
||||
int isNever(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isName2(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isName3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
#define utf8_isName4 isNever
|
||||
|
||||
static
|
||||
int utf8_isNmstrt2(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isNmstrt3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
#define utf8_isNmstrt4 isNever
|
||||
|
||||
#define utf8_isInvalid2 isNever
|
||||
|
||||
static
|
||||
int utf8_isInvalid3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_INVALID3((const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isInvalid4(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_INVALID4((const unsigned char *)p);
|
||||
}
|
||||
|
||||
/* An ENCODING extended with a per-byte classification table and a set of
   character predicates.
     enc   - the base ENCODING; kept first so a struct normal_encoding * and
             an ENCODING * can be cast to one another (see BYTE_TYPE).
     type  - byte type indexed by the unsigned byte value.
     isNameN / isNmstrtN / isInvalidN - predicates reached through the
             IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR macros, which
             paste N (2, 3 or 4) onto the member name.
   Member order matters: NORMAL_VTABLE initialises the nine predicates
   positionally. */
struct normal_encoding {
|
||||
ENCODING enc;
|
||||
unsigned char type[256];
|
||||
int (*isName2)(const ENCODING *, const char *);
|
||||
int (*isName3)(const ENCODING *, const char *);
|
||||
int (*isName4)(const ENCODING *, const char *);
|
||||
int (*isNmstrt2)(const ENCODING *, const char *);
|
||||
int (*isNmstrt3)(const ENCODING *, const char *);
|
||||
int (*isNmstrt4)(const ENCODING *, const char *);
|
||||
int (*isInvalid2)(const ENCODING *, const char *);
|
||||
int (*isInvalid3)(const ENCODING *, const char *);
|
||||
int (*isInvalid4)(const ENCODING *, const char *);
|
||||
};
|
||||
|
||||
#define NORMAL_VTABLE(E) \
|
||||
E ## isName2, \
|
||||
E ## isName3, \
|
||||
E ## isName4, \
|
||||
E ## isNmstrt2, \
|
||||
E ## isNmstrt3, \
|
||||
E ## isNmstrt4, \
|
||||
E ## isInvalid2, \
|
||||
E ## isInvalid3, \
|
||||
E ## isInvalid4
|
||||
|
||||
static int checkCharRefNumber(int);
|
||||
|
||||
#include "xmltok_impl.h"
|
||||
|
@ -92,12 +160,16 @@ static int checkCharRefNumber(int);
|
|||
#define BYTE_TYPE(enc, p) \
|
||||
(((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
|
||||
#define BYTE_TO_ASCII(enc, p) (*p)
|
||||
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
|
||||
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
|
||||
#define IS_INVALID_CHAR(enc, p, n) \
|
||||
((n) == 3 \
|
||||
? UTF8_INVALID3((const unsigned char *)(p)) \
|
||||
: ((n) == 4 ? UTF8_INVALID4((const unsigned char *)(p)) : 0))
|
||||
(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
|
||||
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
|
||||
|
||||
/* c is an ASCII character */
|
||||
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
|
||||
|
@ -110,7 +182,9 @@ static int checkCharRefNumber(int);
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
|
||||
|
@ -183,7 +257,8 @@ static const struct normal_encoding utf8_encoding = {
|
|||
{
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
}
|
||||
},
|
||||
NORMAL_VTABLE(utf8_)
|
||||
};
|
||||
|
||||
static const struct normal_encoding internal_utf8_encoding = {
|
||||
|
@ -191,7 +266,8 @@ static const struct normal_encoding internal_utf8_encoding = {
|
|||
{
|
||||
#include "iasciitab.h"
|
||||
#include "utf8tab.h"
|
||||
}
|
||||
},
|
||||
NORMAL_VTABLE(utf8_)
|
||||
};
|
||||
|
||||
static
|
||||
|
@ -358,9 +434,11 @@ void PREFIX(toUtf16)(const ENCODING *enc, \
|
|||
: unicode_byte_type((p)[1], (p)[0]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
#define IS_NAME_CHAR(enc, p, n) (0)
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
@ -381,7 +459,9 @@ DEFINE_UTF16_TO_UTF16
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct normal_encoding little2_encoding = {
|
||||
|
@ -417,9 +497,11 @@ static const struct normal_encoding internal_little2_encoding = {
|
|||
: unicode_byte_type((p)[0], (p)[1]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
#define IS_NAME_CHAR(enc, p, n) 0
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
@ -440,7 +522,9 @@ DEFINE_UTF16_TO_UTF16
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct normal_encoding big2_encoding = {
|
||||
|
@ -876,51 +960,105 @@ int XmlUtf16Encode(int charNum, unsigned short *buf)
|
|||
return 0;
|
||||
}
|
||||
|
||||
struct single_encoding {
|
||||
struct unknown_encoding {
|
||||
struct normal_encoding normal;
|
||||
unsigned short (*convert)(void *userData, const char *p);
|
||||
void *userData;
|
||||
unsigned short utf16[256];
|
||||
unsigned char utf8[256][4];
|
||||
};
|
||||
|
||||
int XmlSizeOfSingleByteEncoding()
|
||||
int XmlSizeOfUnknownEncoding()
|
||||
{
|
||||
return sizeof(struct single_encoding);
|
||||
return sizeof(struct unknown_encoding);
|
||||
}
|
||||
|
||||
static
|
||||
void single_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
int unknown_isName(const ENCODING *enc, const char *p)
|
||||
{
|
||||
unsigned short c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p);
|
||||
return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
|
||||
}
|
||||
|
||||
static
|
||||
int unknown_isNmstrt(const ENCODING *enc, const char *p)
|
||||
{
|
||||
unsigned short c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p);
|
||||
return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
|
||||
}
|
||||
|
||||
static
|
||||
int unknown_isInvalid(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p) == 0;
|
||||
}
|
||||
|
||||
static
|
||||
void unknown_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
{
|
||||
char buf[XML_UTF8_ENCODE_MAX];
|
||||
for (;;) {
|
||||
const unsigned char *utf8;
|
||||
int n;
|
||||
if (*fromP == fromLim)
|
||||
break;
|
||||
utf8 = ((const struct single_encoding *)enc)->utf8[(unsigned char)**fromP];
|
||||
utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
|
||||
n = *utf8++;
|
||||
if (n > toLim - *toP)
|
||||
break;
|
||||
if (n == 0) {
|
||||
unsigned short c
|
||||
= ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
||||
n = XmlUtf8Encode(c, buf);
|
||||
if (n > toLim - *toP)
|
||||
break;
|
||||
utf8 = buf;
|
||||
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
||||
- (BT_LEAD2 - 2);
|
||||
}
|
||||
else {
|
||||
if (n > toLim - *toP)
|
||||
break;
|
||||
(*fromP)++;
|
||||
}
|
||||
do {
|
||||
*(*toP)++ = *utf8++;
|
||||
} while (--n != 0);
|
||||
(*fromP)++;
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void single_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
void unknown_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
while (*fromP != fromLim && *toP != toLim)
|
||||
*(*toP)++ = ((const struct single_encoding *)enc)->utf16[(unsigned char)*(*fromP)++];
|
||||
while (*fromP != fromLim && *toP != toLim) {
|
||||
unsigned short c
|
||||
= ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
|
||||
if (c == 0) {
|
||||
c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
||||
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
||||
- (BT_LEAD2 - 2);
|
||||
}
|
||||
else
|
||||
(*fromP)++;
|
||||
*(*toP)++ = c;
|
||||
}
|
||||
}
|
||||
|
||||
ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
|
||||
ENCODING *
|
||||
XmlInitUnknownEncoding(void *mem,
|
||||
unsigned short *table,
|
||||
unsigned short (*convert)(void *userData, const char *p),
|
||||
void *userData)
|
||||
{
|
||||
int i;
|
||||
struct single_encoding *e = mem;
|
||||
struct unknown_encoding *e = mem;
|
||||
for (i = 0; i < sizeof(struct normal_encoding); i++)
|
||||
((char *)mem)[i] = ((char *)&latin1_encoding)[i];
|
||||
for (i = 0; i < 128; i++)
|
||||
|
@ -935,12 +1073,24 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
|
|||
&& latin1_encoding.type[c] != BT_NONXML
|
||||
&& c != i)
|
||||
return 0;
|
||||
e->normal.type[i] = latin1_encoding.type[c];
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = (char)c;
|
||||
if (c >= 2 && c <= 4) {
|
||||
e->normal.type[i] = BT_LEAD2 + (c - 2);
|
||||
e->utf8[i][0] = 0;
|
||||
e->utf16[i] = 0;
|
||||
}
|
||||
else {
|
||||
e->normal.type[i] = latin1_encoding.type[c];
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = (char)c;
|
||||
e->utf16[i] = c == 0 ? 0xFFFF : c;
|
||||
}
|
||||
}
|
||||
else if (checkCharRefNumber(c) < 0)
|
||||
else if (checkCharRefNumber(c) < 0) {
|
||||
e->normal.type[i] = BT_NONXML;
|
||||
e->utf16[i] = 0xFFFF;
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = 0;
|
||||
}
|
||||
else {
|
||||
if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
|
||||
e->normal.type[i] = BT_NMSTRT;
|
||||
|
@ -949,10 +1099,23 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
|
|||
else
|
||||
e->normal.type[i] = BT_OTHER;
|
||||
e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
|
||||
e->utf16[i] = c;
|
||||
}
|
||||
e->utf16[i] = c;
|
||||
}
|
||||
e->normal.enc.utf8Convert = single_toUtf8;
|
||||
e->normal.enc.utf16Convert = single_toUtf16;
|
||||
e->userData = userData;
|
||||
e->convert = convert;
|
||||
if (convert) {
|
||||
e->normal.isName2 = unknown_isName;
|
||||
e->normal.isName3 = unknown_isName;
|
||||
e->normal.isName4 = unknown_isName;
|
||||
e->normal.isNmstrt2 = unknown_isNmstrt;
|
||||
e->normal.isNmstrt3 = unknown_isNmstrt;
|
||||
e->normal.isNmstrt4 = unknown_isNmstrt;
|
||||
e->normal.isInvalid2 = unknown_isInvalid;
|
||||
e->normal.isInvalid3 = unknown_isInvalid;
|
||||
e->normal.isInvalid4 = unknown_isInvalid;
|
||||
}
|
||||
e->normal.enc.utf8Convert = unknown_toUtf8;
|
||||
e->normal.enc.utf16Convert = unknown_toUtf16;
|
||||
return &(e->normal.enc);
|
||||
}
|
||||
|
|
|
@ -262,8 +262,12 @@ const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
|
|||
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
|
||||
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
|
||||
|
||||
int XMLTOKAPI XmlSizeOfSingleByteEncoding();
|
||||
ENCODING XMLTOKAPI *XmlInitSingleByteEncoding(void *mem, unsigned short *table);
|
||||
int XMLTOKAPI XmlSizeOfUnknownEncoding();
|
||||
ENCODING XMLTOKAPI *
|
||||
XmlInitUnknownEncoding(void *mem,
|
||||
unsigned short *table,
|
||||
unsigned short (*convert)(void *userData, const char *p),
|
||||
void *userData);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -56,7 +56,7 @@ Contributor(s):
|
|||
|
||||
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
|
||||
case BT_NONASCII: \
|
||||
if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \
|
||||
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_INVALID; \
|
||||
} \
|
||||
|
@ -84,7 +84,7 @@ Contributor(s):
|
|||
|
||||
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
|
||||
case BT_NONASCII: \
|
||||
if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \
|
||||
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_INVALID; \
|
||||
} \
|
||||
|
@ -1082,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
ptr += MINBPC;
|
||||
break;
|
||||
case BT_NONASCII:
|
||||
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
|
||||
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NAME;
|
||||
break;
|
||||
}
|
||||
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
|
||||
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NMTOKEN;
|
||||
break;
|
||||
|
|
|
@ -23,24 +23,52 @@ Contributor(s):
|
|||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
|
||||
int codepage(int cp, unsigned short *map)
|
||||
int codepageMap(int cp, unsigned short *map)
|
||||
{
|
||||
int i;
|
||||
CPINFO info;
|
||||
if (!GetCPInfo(cp, &info) || info.MaxCharSize > 1)
|
||||
if (!GetCPInfo(cp, &info) || info.MaxCharSize > 2)
|
||||
return 0;
|
||||
for (i = 0; i < 256; i++)
|
||||
map[i] = 0;
|
||||
if (info.MaxCharSize > 1) {
|
||||
for (i = 0; i < MAX_LEADBYTES; i++) {
|
||||
int j, lim;
|
||||
if (info.LeadByte[i] == 0 && info.LeadByte[i + 1] == 0)
|
||||
break;
|
||||
lim = info.LeadByte[i + 1];
|
||||
for (j = info.LeadByte[i]; j < lim; j++)
|
||||
map[j] = 2;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 256; i++) {
|
||||
char c = i;
|
||||
if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
||||
&c, 1, map + i, 1) == 0)
|
||||
map[i] = 0;
|
||||
if (map[i] == 0) {
|
||||
char c = i;
|
||||
if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
||||
&c, 1, map + i, 1) == 0)
|
||||
map[i] = 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned short codepageConvert(int cp, const char *p)
|
||||
{
|
||||
unsigned short c;
|
||||
if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
||||
p, 2, &c, 1) == 1)
|
||||
return c;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else /* not WIN32 */
|
||||
|
||||
int codepage(int cp, unsigned short *map)
|
||||
int codepageMap(int cp, unsigned short *map)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Stub used when not building for WIN32: no code-page conversion is
   available, so every request yields 0.  Both parameters are ignored. */
unsigned short codepageConvert(int cp, const char *p)
{
  (void)cp;  /* unused outside WIN32 */
  (void)p;
  return 0;
}
|
||||
|
|
|
@ -18,4 +18,5 @@ James Clark. All Rights Reserved.
|
|||
Contributor(s):
|
||||
*/
|
||||
|
||||
int codepage(int cp, unsigned short *map);
|
||||
int codepageMap(int cp, unsigned short *map);
|
||||
unsigned short codepageConvert(int cp, const char *p);
|
||||
|
|
|
@ -369,9 +369,15 @@ int externalEntityRefStream(XML_Parser parser,
|
|||
}
|
||||
|
||||
static
|
||||
int singleByteEncoding(void *userData,
|
||||
const XML_Char *encoding,
|
||||
unsigned short *table)
|
||||
unsigned short unknownEncodingConvert(void *data, const char *p)
|
||||
{
|
||||
return codepageConvert(*(int *)data, p);
|
||||
}
|
||||
|
||||
static
|
||||
int unknownEncoding(void *userData,
|
||||
const XML_Char *name,
|
||||
XML_Encoding *info)
|
||||
{
|
||||
int cp;
|
||||
static const XML_Char prefixL[] = T("windows-");
|
||||
|
@ -379,13 +385,13 @@ int singleByteEncoding(void *userData,
|
|||
int i;
|
||||
|
||||
for (i = 0; prefixU[i]; i++)
|
||||
if (encoding[i] != prefixU[i] && encoding[i] != prefixL[i])
|
||||
if (name[i] != prefixU[i] && name[i] != prefixL[i])
|
||||
return 0;
|
||||
|
||||
cp = 0;
|
||||
for (; encoding[i]; i++) {
|
||||
for (; name[i]; i++) {
|
||||
static const XML_Char digits[] = T("0123456789");
|
||||
const XML_Char *s = tcschr(digits, encoding[i]);
|
||||
const XML_Char *s = tcschr(digits, name[i]);
|
||||
if (!s)
|
||||
return 0;
|
||||
cp *= 10;
|
||||
|
@ -393,7 +399,17 @@ int singleByteEncoding(void *userData,
|
|||
if (cp >= 0x10000)
|
||||
return 0;
|
||||
}
|
||||
return codepage(cp, table);
|
||||
if (!codepageMap(cp, info->map))
|
||||
return 0;
|
||||
info->convert = unknownEncodingConvert;
|
||||
/* We could just cast the code page integer to a void *,
|
||||
and avoid the use of release. */
|
||||
info->release = free;
|
||||
info->data = malloc(sizeof(int));
|
||||
if (!info->data)
|
||||
return 0;
|
||||
*(int *)info->data = cp;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static
|
||||
|
@ -498,7 +514,7 @@ int tmain(int argc, XML_Char **argv)
|
|||
#endif
|
||||
}
|
||||
if (windowsCodePages)
|
||||
XML_SetSingleByteEncodingHandler(parser, singleByteEncoding);
|
||||
XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
|
||||
if (processExternalEntities) {
|
||||
if (!XML_SetBase(parser, argv[i])) {
|
||||
ftprintf(stderr, T("%s: out of memory"), argv[0]);
|
||||
|
|
Loading…
Add table
Reference in a new issue