Generalize unknown encoding support

This commit is contained in:
James Clark 1998-06-02 08:57:14 +00:00
parent e8ac3d8f5f
commit 84835ac373
8 changed files with 330 additions and 81 deletions

View file

@ -215,11 +215,14 @@ typedef struct {
XML_UnparsedEntityDeclHandler unparsedEntityDeclHandler;
XML_NotationDeclHandler notationDeclHandler;
XML_ExternalEntityRefHandler externalEntityRefHandler;
XML_SingleByteEncodingHandler singleByteEncodingHandler;
XML_UnknownEncodingHandler unknownEncodingHandler;
const ENCODING *encoding;
INIT_ENCODING initEncoding;
const XML_Char *protocolEncodingName;
void *singleByteEncodingMem;
void *unknownEncodingMem;
void *unknownEncodingData;
void *unknownEncodingHandlerData;
void (*unknownEncodingRelease)(void *);
PROLOG_STATE prologState;
Processor *processor;
enum XML_Error errorCode;
@ -253,10 +256,14 @@ typedef struct {
#define unparsedEntityDeclHandler (((Parser *)parser)->unparsedEntityDeclHandler)
#define notationDeclHandler (((Parser *)parser)->notationDeclHandler)
#define externalEntityRefHandler (((Parser *)parser)->externalEntityRefHandler)
#define singleByteEncodingHandler (((Parser *)parser)->singleByteEncodingHandler)
#define unknownEncodingHandler (((Parser *)parser)->unknownEncodingHandler)
#define encoding (((Parser *)parser)->encoding)
#define initEncoding (((Parser *)parser)->initEncoding)
#define singleByteEncodingMem (((Parser *)parser)->singleByteEncodingMem)
#define unknownEncodingMem (((Parser *)parser)->unknownEncodingMem)
#define unknownEncodingData (((Parser *)parser)->unknownEncodingData)
#define unknownEncodingHandlerData \
(((Parser *)parser)->unknownEncodingHandlerData)
#define unknownEncodingRelease (((Parser *)parser)->unknownEncodingRelease)
#define protocolEncodingName (((Parser *)parser)->protocolEncodingName)
#define prologState (((Parser *)parser)->prologState)
#define processor (((Parser *)parser)->processor)
@ -304,7 +311,7 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName)
unparsedEntityDeclHandler = 0;
notationDeclHandler = 0;
externalEntityRefHandler = 0;
singleByteEncodingHandler = 0;
unknownEncodingHandler = 0;
buffer = 0;
bufferPtr = 0;
bufferEnd = 0;
@ -328,7 +335,10 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName)
groupSize = 0;
groupConnector = 0;
hadExternalDoctype = 0;
singleByteEncodingMem = 0;
unknownEncodingMem = 0;
unknownEncodingRelease = 0;
unknownEncodingData = 0;
unknownEncodingHandlerData = 0;
poolInit(&tempPool);
poolInit(&temp2Pool);
protocolEncodingName = encodingName ? poolCopyString(&tempPool, encodingName) : 0;
@ -353,7 +363,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser,
XML_CharacterDataHandler oldCharacterDataHandler = characterDataHandler;
XML_ProcessingInstructionHandler oldProcessingInstructionHandler = processingInstructionHandler;
XML_ExternalEntityRefHandler oldExternalEntityRefHandler = externalEntityRefHandler;
XML_SingleByteEncodingHandler oldSingleByteEncodingHandler = singleByteEncodingHandler;
XML_UnknownEncodingHandler oldUnknownEncodingHandler = unknownEncodingHandler;
void *oldUserData = userData;
parser = XML_ParserCreate(encodingName);
@ -364,7 +374,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser,
characterDataHandler = oldCharacterDataHandler;
processingInstructionHandler = oldProcessingInstructionHandler;
externalEntityRefHandler = oldExternalEntityRefHandler;
singleByteEncodingHandler = oldSingleByteEncodingHandler;
unknownEncodingHandler = oldUnknownEncodingHandler;
userData = oldUserData;
if (!dtdCopy(&dtd, oldDtd) || !setOpenEntityNames(parser, openEntityNames)) {
XML_ParserFree(parser);
@ -396,7 +406,9 @@ void XML_ParserFree(XML_Parser parser)
free(groupConnector);
free(buffer);
free(dataBuf);
free(singleByteEncodingMem);
free(unknownEncodingMem);
if (unknownEncodingRelease)
unknownEncodingRelease(unknownEncodingData);
free(parser);
}
@ -461,10 +473,12 @@ void XML_SetExternalEntityRefHandler(XML_Parser parser,
externalEntityRefHandler = handler;
}
void XML_SetSingleByteEncodingHandler(XML_Parser parser,
XML_SingleByteEncodingHandler handler)
void XML_SetUnknownEncodingHandler(XML_Parser parser,
XML_UnknownEncodingHandler handler,
void *data)
{
singleByteEncodingHandler = handler;
unknownEncodingHandler = handler;
unknownEncodingHandlerData = data;
}
int XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
@ -1331,22 +1345,35 @@ processXmlDecl(XML_Parser parser, int isGeneralTextEntity,
static enum XML_Error
handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName)
{
if (singleByteEncodingHandler) {
unsigned short table[256];
if (unknownEncodingHandler) {
XML_Encoding info;
int i;
for (i = 0; i < 256; i++)
table[i] = 0;
if (singleByteEncodingHandler(userData, encodingName, table)) {
info.map[i] = 0;
info.convert = 0;
info.data = 0;
info.release = 0;
if (unknownEncodingHandler(unknownEncodingHandlerData, encodingName, &info)) {
ENCODING *enc;
singleByteEncodingMem = malloc(XmlSizeOfSingleByteEncoding());
if (!singleByteEncodingMem)
unknownEncodingMem = malloc(XmlSizeOfUnknownEncoding());
if (!unknownEncodingMem) {
if (info.release)
info.release(info.data);
return XML_ERROR_NO_MEMORY;
enc = XmlInitSingleByteEncoding(singleByteEncodingMem, table);
}
enc = XmlInitUnknownEncoding(unknownEncodingMem,
info.map,
info.convert,
info.data);
if (enc) {
unknownEncodingData = info.data;
unknownEncodingRelease = info.release;
encoding = enc;
return XML_ERROR_NONE;
}
}
if (info.release)
info.release(info.data);
}
return XML_ERROR_UNKNOWN_ENCODING;
}

View file

@ -110,10 +110,19 @@ typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser,
const XML_Char *systemId,
const XML_Char *publicId);
/* Description of an encoding supplied by an XML_UnknownEncodingHandler.
   The parser sets every map entry and the three pointers to 0 before
   invoking the handler; the handler fills in the members it needs. */
typedef struct {
/* One entry per possible byte value (0..255). */
unsigned short map[256];
/* Opaque pointer handed back as the first argument of convert and release. */
void *data;
/* Optional (may stay 0).  Called with data and a pointer into the input;
   returns a 16-bit code.  A return value of 0 is treated as an invalid
   character by the tokenizer's validity check. */
unsigned short (*convert)(void *data, const char *s);
/* Optional (may stay 0).  Called with data when the parser is freed, or
   straight away if the encoding cannot be set up after the handler ran. */
void (*release)(void *data);
} XML_Encoding;
typedef int (*XML_SingleByteEncodingHandler)(void *userData,
const XML_Char *encoding,
unsigned short *table);
/* Called when the parser meets an encoding name it does not know.
   The encodingHandlerData passed to this call is that which was passed as the
   third argument (after the parser and the handler) to
   XML_SetUnknownEncodingHandler.  name is the encoding name; info is an
   XML_Encoding, cleared by the parser beforehand, for the handler to fill in.
   A non-zero return tells the parser to try to use info; a zero return makes
   the parser report an unknown encoding. */
typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info);
void XMLPARSEAPI
XML_SetElementHandler(XML_Parser parser,
@ -141,8 +150,9 @@ XML_SetExternalEntityRefHandler(XML_Parser parser,
XML_ExternalEntityRefHandler handler);
void XMLPARSEAPI
XML_SetSingleByteEncodingHandler(XML_Parser parser,
XML_SingleByteEncodingHandler handler);
XML_SetUnknownEncodingHandler(XML_Parser parser,
XML_UnknownEncodingHandler handler,
void *encodingHandlerData);
/* This value is passed as the userData argument to callbacks. */
void XMLPARSEAPI

View file

@ -78,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and
#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
/* Predicate that rejects every input; used to fill vtable slots
   (e.g. utf8_isName4, utf8_isInvalid2) that can never match. */
static
int isNever(const ENCODING *enc, const char *p)
{
  (void)enc;
  (void)p;
  return 0;
}
/* Name-character test for the isName2 vtable slot: looks the bytes at p
   up in namePages via UTF8_GET_NAMING2.  enc is not consulted. */
static
int utf8_isName2(const ENCODING *enc, const char *p)
{
  const unsigned char *bytes = (const unsigned char *)p;
  return UTF8_GET_NAMING2(namePages, bytes);
}
/* Name-character test for the isName3 vtable slot: looks the bytes at p
   up in namePages via UTF8_GET_NAMING3.  enc is not consulted. */
static
int utf8_isName3(const ENCODING *enc, const char *p)
{
  const unsigned char *bytes = (const unsigned char *)p;
  return UTF8_GET_NAMING3(namePages, bytes);
}
#define utf8_isName4 isNever
static
int utf8_isNmstrt2(const ENCODING *enc, const char *p)
{
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}
/* Name-start-character test for the isNmstrt3 vtable slot: looks the bytes
   at p up in nmstrtPages via UTF8_GET_NAMING3.  enc is not consulted. */
static
int utf8_isNmstrt3(const ENCODING *enc, const char *p)
{
  const unsigned char *bytes = (const unsigned char *)p;
  return UTF8_GET_NAMING3(nmstrtPages, bytes);
}
#define utf8_isNmstrt4 isNever
#define utf8_isInvalid2 isNever
/* Validity test for the isInvalid3 vtable slot: applies UTF8_INVALID3 to
   the bytes at p and returns its result.  enc is not consulted. */
static
int utf8_isInvalid3(const ENCODING *enc, const char *p)
{
  const unsigned char *bytes = (const unsigned char *)p;
  return UTF8_INVALID3(bytes);
}
/* Validity test for the isInvalid4 vtable slot: applies UTF8_INVALID4 to
   the bytes at p (non-zero when the first byte is 0xF4 and the second has
   any of the 0x30 bits set).  enc is not consulted. */
static
int utf8_isInvalid4(const ENCODING *enc, const char *p)
{
  const unsigned char *bytes = (const unsigned char *)p;
  return UTF8_INVALID4(bytes);
}
/* An ENCODING extended with a per-byte classification table and a small
   table of predicates for characters that take more than one byte. */
struct normal_encoding {
/* Base encoding.  Code in this file casts an ENCODING * to
   struct normal_encoding *, which relies on enc being the first member. */
ENCODING enc;
/* Byte type for each possible byte value; read through BYTE_TYPE. */
unsigned char type[256];
/* Name-character predicates; IS_NAME_CHAR(enc, p, n) calls isName<n>. */
int (*isName2)(const ENCODING *, const char *);
int (*isName3)(const ENCODING *, const char *);
int (*isName4)(const ENCODING *, const char *);
/* Name-start predicates; IS_NMSTRT_CHAR(enc, p, n) calls isNmstrt<n>. */
int (*isNmstrt2)(const ENCODING *, const char *);
int (*isNmstrt3)(const ENCODING *, const char *);
int (*isNmstrt4)(const ENCODING *, const char *);
/* Validity predicates; IS_INVALID_CHAR(enc, p, n) calls isInvalid<n>. */
int (*isInvalid2)(const ENCODING *, const char *);
int (*isInvalid3)(const ENCODING *, const char *);
int (*isInvalid4)(const ENCODING *, const char *);
};
/* Expands to the nine predicate initializers of struct normal_encoding, in
   member order, pasting E in front of each member name; for example
   NORMAL_VTABLE(utf8_) yields utf8_isName2, ..., utf8_isInvalid4. */
#define NORMAL_VTABLE(E) \
E ## isName2, \
E ## isName3, \
E ## isName4, \
E ## isNmstrt2, \
E ## isNmstrt3, \
E ## isNmstrt4, \
E ## isInvalid2, \
E ## isInvalid3, \
E ## isInvalid4
static int checkCharRefNumber(int);
#include "xmltok_impl.h"
@ -92,12 +160,16 @@ static int checkCharRefNumber(int);
#define BYTE_TYPE(enc, p) \
(((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
#define BYTE_TO_ASCII(enc, p) (*p)
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
#define IS_NAME_CHAR(enc, p, n) \
(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
((n) == 3 \
? UTF8_INVALID3((const unsigned char *)(p)) \
: ((n) == 4 ? UTF8_INVALID4((const unsigned char *)(p)) : 0))
(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
@ -110,7 +182,9 @@ static int checkCharRefNumber(int);
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
@ -183,7 +257,8 @@ static const struct normal_encoding utf8_encoding = {
{
#include "asciitab.h"
#include "utf8tab.h"
}
},
NORMAL_VTABLE(utf8_)
};
static const struct normal_encoding internal_utf8_encoding = {
@ -191,7 +266,8 @@ static const struct normal_encoding internal_utf8_encoding = {
{
#include "iasciitab.h"
#include "utf8tab.h"
}
},
NORMAL_VTABLE(utf8_)
};
static
@ -358,9 +434,11 @@ void PREFIX(toUtf16)(const ENCODING *enc, \
: unicode_byte_type((p)[1], (p)[0]))
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define IS_NAME_CHAR(enc, p, n) \
#define IS_NAME_CHAR(enc, p, n) (0)
#define IS_NAME_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define IS_NMSTRT_CHAR(enc, p, n) \
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
#include "xmltok_impl.c"
@ -381,7 +459,9 @@ DEFINE_UTF16_TO_UTF16
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
static const struct normal_encoding little2_encoding = {
@ -417,9 +497,11 @@ static const struct normal_encoding internal_little2_encoding = {
: unicode_byte_type((p)[0], (p)[1]))
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define IS_NAME_CHAR(enc, p, n) \
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define IS_NMSTRT_CHAR(enc, p, n) \
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
#include "xmltok_impl.c"
@ -440,7 +522,9 @@ DEFINE_UTF16_TO_UTF16
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
static const struct normal_encoding big2_encoding = {
@ -876,51 +960,105 @@ int XmlUtf16Encode(int charNum, unsigned short *buf)
return 0;
}
struct single_encoding {
struct unknown_encoding {
struct normal_encoding normal;
unsigned short (*convert)(void *userData, const char *p);
void *userData;
unsigned short utf16[256];
unsigned char utf8[256][4];
};
int XmlSizeOfSingleByteEncoding()
int XmlSizeOfUnknownEncoding()
{
return sizeof(struct single_encoding);
return sizeof(struct unknown_encoding);
}
static
void single_toUtf8(const ENCODING *enc,
const char **fromP, const char *fromLim,
char **toP, const char *toLim)
int unknown_isName(const ENCODING *enc, const char *p)
{
  /* Ask the user-supplied converter for the 16-bit code at p, then look
     it up in the name-character pages (high byte, low byte). */
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  unsigned short c = ue->convert(ue->userData, p);
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
}
/* Name-start predicate for user-supplied encodings: converts the bytes at
   p with the encoding's convert callback and looks the resulting 16-bit
   code up in nmstrtPages. */
static
int unknown_isNmstrt(const ENCODING *enc, const char *p)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  unsigned short c = ue->convert(ue->userData, p);
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
}
/* Validity predicate for user-supplied encodings: the bytes at p are
   invalid exactly when the encoding's convert callback returns 0. */
static
int unknown_isInvalid(const ENCODING *enc, const char *p)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  unsigned short c = ue->convert(ue->userData, p);
  return c == 0;
}
static
void unknown_toUtf8(const ENCODING *enc,
const char **fromP, const char *fromLim,
char **toP, const char *toLim)
{
char buf[XML_UTF8_ENCODE_MAX];
for (;;) {
const unsigned char *utf8;
int n;
if (*fromP == fromLim)
break;
utf8 = ((const struct single_encoding *)enc)->utf8[(unsigned char)**fromP];
utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
n = *utf8++;
if (n > toLim - *toP)
break;
if (n == 0) {
unsigned short c
= ((const struct unknown_encoding *)enc)
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
n = XmlUtf8Encode(c, buf);
if (n > toLim - *toP)
break;
utf8 = buf;
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
- (BT_LEAD2 - 2);
}
else {
if (n > toLim - *toP)
break;
(*fromP)++;
}
do {
*(*toP)++ = *utf8++;
} while (--n != 0);
(*fromP)++;
}
}
static
void single_toUtf16(const ENCODING *enc,
const char **fromP, const char *fromLim,
unsigned short **toP, const unsigned short *toLim)
void unknown_toUtf16(const ENCODING *enc,
const char **fromP, const char *fromLim,
unsigned short **toP, const unsigned short *toLim)
{
while (*fromP != fromLim && *toP != toLim)
*(*toP)++ = ((const struct single_encoding *)enc)->utf16[(unsigned char)*(*fromP)++];
while (*fromP != fromLim && *toP != toLim) {
unsigned short c
= ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
if (c == 0) {
c = ((const struct unknown_encoding *)enc)
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
- (BT_LEAD2 - 2);
}
else
(*fromP)++;
*(*toP)++ = c;
}
}
ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
ENCODING *
XmlInitUnknownEncoding(void *mem,
unsigned short *table,
unsigned short (*convert)(void *userData, const char *p),
void *userData)
{
int i;
struct single_encoding *e = mem;
struct unknown_encoding *e = mem;
for (i = 0; i < sizeof(struct normal_encoding); i++)
((char *)mem)[i] = ((char *)&latin1_encoding)[i];
for (i = 0; i < 128; i++)
@ -935,12 +1073,24 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
&& latin1_encoding.type[c] != BT_NONXML
&& c != i)
return 0;
e->normal.type[i] = latin1_encoding.type[c];
e->utf8[i][0] = 1;
e->utf8[i][1] = (char)c;
if (c >= 2 && c <= 4) {
e->normal.type[i] = BT_LEAD2 + (c - 2);
e->utf8[i][0] = 0;
e->utf16[i] = 0;
}
else {
e->normal.type[i] = latin1_encoding.type[c];
e->utf8[i][0] = 1;
e->utf8[i][1] = (char)c;
e->utf16[i] = c == 0 ? 0xFFFF : c;
}
}
else if (checkCharRefNumber(c) < 0)
else if (checkCharRefNumber(c) < 0) {
e->normal.type[i] = BT_NONXML;
e->utf16[i] = 0xFFFF;
e->utf8[i][0] = 1;
e->utf8[i][1] = 0;
}
else {
if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
e->normal.type[i] = BT_NMSTRT;
@ -949,10 +1099,23 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table)
else
e->normal.type[i] = BT_OTHER;
e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
e->utf16[i] = c;
}
e->utf16[i] = c;
}
e->normal.enc.utf8Convert = single_toUtf8;
e->normal.enc.utf16Convert = single_toUtf16;
e->userData = userData;
e->convert = convert;
if (convert) {
e->normal.isName2 = unknown_isName;
e->normal.isName3 = unknown_isName;
e->normal.isName4 = unknown_isName;
e->normal.isNmstrt2 = unknown_isNmstrt;
e->normal.isNmstrt3 = unknown_isNmstrt;
e->normal.isNmstrt4 = unknown_isNmstrt;
e->normal.isInvalid2 = unknown_isInvalid;
e->normal.isInvalid3 = unknown_isInvalid;
e->normal.isInvalid4 = unknown_isInvalid;
}
e->normal.enc.utf8Convert = unknown_toUtf8;
e->normal.enc.utf16Convert = unknown_toUtf16;
return &(e->normal.enc);
}

View file

@ -262,8 +262,12 @@ const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
int XMLTOKAPI XmlSizeOfSingleByteEncoding();
ENCODING XMLTOKAPI *XmlInitSingleByteEncoding(void *mem, unsigned short *table);
int XMLTOKAPI XmlSizeOfUnknownEncoding();
ENCODING XMLTOKAPI *
XmlInitUnknownEncoding(void *mem,
unsigned short *table,
unsigned short (*convert)(void *userData, const char *p),
void *userData);
#ifdef __cplusplus
}

View file

@ -56,7 +56,7 @@ Contributor(s):
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
case BT_NONASCII: \
if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@ -84,7 +84,7 @@ Contributor(s):
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
case BT_NONASCII: \
if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@ -1082,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
ptr += MINBPC;
break;
case BT_NONASCII:
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
ptr += MINBPC;
tok = XML_TOK_NAME;
break;
}
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
ptr += MINBPC;
tok = XML_TOK_NMTOKEN;
break;

View file

@ -23,24 +23,52 @@ Contributor(s):
#ifdef WIN32
#include <windows.h>
int codepage(int cp, unsigned short *map)
int codepageMap(int cp, unsigned short *map)
{
int i;
CPINFO info;
if (!GetCPInfo(cp, &info) || info.MaxCharSize > 1)
if (!GetCPInfo(cp, &info) || info.MaxCharSize > 2)
return 0;
for (i = 0; i < 256; i++)
map[i] = 0;
if (info.MaxCharSize > 1) {
  /* CPINFO.LeadByte is a list of (first, last) byte pairs describing
     inclusive lead-byte ranges, terminated by a pair of zero bytes.
     Walk it one pair at a time (never reading past the array) and mark
     every byte of each range, including the last one, as the lead byte
     of a 2-byte sequence. */
  for (i = 0; i + 1 < MAX_LEADBYTES; i += 2) {
    int j, lim;
    if (info.LeadByte[i] == 0 && info.LeadByte[i + 1] == 0)
      break;
    lim = info.LeadByte[i + 1];
    for (j = info.LeadByte[i]; j <= lim; j++)
      map[j] = 2;
  }
}
for (i = 0; i < 256; i++) {
char c = i;
if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
&c, 1, map + i, 1) == 0)
map[i] = 0;
if (map[i] == 0) {
char c = i;
if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
&c, 1, map + i, 1) == 0)
map[i] = 0;
}
}
return 1;
}
/* Converts the 2 bytes at p from code page cp with MultiByteToWideChar.
   Returns the resulting 16-bit value when exactly one wide character was
   produced, and 0 otherwise. */
unsigned short codepageConvert(int cp, const char *p)
{
  unsigned short wide;
  int produced = MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
                                     p, 2, &wide, 1);
  return produced == 1 ? wide : 0;
}
#else /* not WIN32 */
int codepage(int cp, unsigned short *map)
int codepageMap(int cp, unsigned short *map)
{
return 0;
}
/* Non-WIN32 stand-in: no code page conversion is performed, so every
   request yields 0. */
unsigned short codepageConvert(int cp, const char *p)
{
  (void)cp;
  (void)p;
  return 0;
}

View file

@ -18,4 +18,5 @@ James Clark. All Rights Reserved.
Contributor(s):
*/
int codepage(int cp, unsigned short *map);
int codepageMap(int cp, unsigned short *map);
unsigned short codepageConvert(int cp, const char *p);

View file

@ -369,9 +369,15 @@ int externalEntityRefStream(XML_Parser parser,
}
static
int singleByteEncoding(void *userData,
const XML_Char *encoding,
unsigned short *table)
unsigned short unknownEncodingConvert(void *data, const char *p)
{
  /* data points at the int code page number stored by unknownEncoding. */
  const int *codePage = data;
  return codepageConvert(*codePage, p);
}
static
int unknownEncoding(void *userData,
const XML_Char *name,
XML_Encoding *info)
{
int cp;
static const XML_Char prefixL[] = T("windows-");
@ -379,13 +385,13 @@ int singleByteEncoding(void *userData,
int i;
for (i = 0; prefixU[i]; i++)
if (encoding[i] != prefixU[i] && encoding[i] != prefixL[i])
if (name[i] != prefixU[i] && name[i] != prefixL[i])
return 0;
cp = 0;
for (; encoding[i]; i++) {
for (; name[i]; i++) {
static const XML_Char digits[] = T("0123456789");
const XML_Char *s = tcschr(digits, encoding[i]);
const XML_Char *s = tcschr(digits, name[i]);
if (!s)
return 0;
cp *= 10;
@ -393,7 +399,17 @@ int singleByteEncoding(void *userData,
if (cp >= 0x10000)
return 0;
}
return codepage(cp, table);
if (!codepageMap(cp, info->map))
return 0;
info->convert = unknownEncodingConvert;
/* We could just cast the code page integer to a void *,
and avoid the use of release. */
info->release = free;
info->data = malloc(sizeof(int));
if (!info->data)
return 0;
*(int *)info->data = cp;
return 1;
}
static
@ -498,7 +514,7 @@ int tmain(int argc, XML_Char **argv)
#endif
}
if (windowsCodePages)
XML_SetSingleByteEncodingHandler(parser, singleByteEncoding);
XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
if (processExternalEntities) {
if (!XML_SetBase(parser, argv[i])) {
ftprintf(stderr, T("%s: out of memory"), argv[0]);