From 84835ac373b2b82260078286d6c09e155565d502 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 2 Jun 1998 08:57:14 +0000 Subject: [PATCH] Generalize unknown encoding support --- expat/xmlparse/xmlparse.c | 65 ++++++++--- expat/xmlparse/xmlparse.h | 20 +++- expat/xmltok/xmltok.c | 233 +++++++++++++++++++++++++++++++------ expat/xmltok/xmltok.h | 8 +- expat/xmltok/xmltok_impl.c | 8 +- expat/xmlwf/codepage.c | 42 +++++-- expat/xmlwf/codepage.h | 3 +- expat/xmlwf/xmlwf.c | 32 +++-- 8 files changed, 330 insertions(+), 81 deletions(-) diff --git a/expat/xmlparse/xmlparse.c b/expat/xmlparse/xmlparse.c index 72fa5dc3..a408e108 100755 --- a/expat/xmlparse/xmlparse.c +++ b/expat/xmlparse/xmlparse.c @@ -215,11 +215,14 @@ typedef struct { XML_UnparsedEntityDeclHandler unparsedEntityDeclHandler; XML_NotationDeclHandler notationDeclHandler; XML_ExternalEntityRefHandler externalEntityRefHandler; - XML_SingleByteEncodingHandler singleByteEncodingHandler; + XML_UnknownEncodingHandler unknownEncodingHandler; const ENCODING *encoding; INIT_ENCODING initEncoding; const XML_Char *protocolEncodingName; - void *singleByteEncodingMem; + void *unknownEncodingMem; + void *unknownEncodingData; + void *unknownEncodingHandlerData; + void (*unknownEncodingRelease)(void *); PROLOG_STATE prologState; Processor *processor; enum XML_Error errorCode; @@ -253,10 +256,14 @@ typedef struct { #define unparsedEntityDeclHandler (((Parser *)parser)->unparsedEntityDeclHandler) #define notationDeclHandler (((Parser *)parser)->notationDeclHandler) #define externalEntityRefHandler (((Parser *)parser)->externalEntityRefHandler) -#define singleByteEncodingHandler (((Parser *)parser)->singleByteEncodingHandler) +#define unknownEncodingHandler (((Parser *)parser)->unknownEncodingHandler) #define encoding (((Parser *)parser)->encoding) #define initEncoding (((Parser *)parser)->initEncoding) -#define singleByteEncodingMem (((Parser *)parser)->singleByteEncodingMem) +#define unknownEncodingMem (((Parser 
*)parser)->unknownEncodingMem) +#define unknownEncodingData (((Parser *)parser)->unknownEncodingData) +#define unknownEncodingHandlerData \ + (((Parser *)parser)->unknownEncodingHandlerData) +#define unknownEncodingRelease (((Parser *)parser)->unknownEncodingRelease) #define protocolEncodingName (((Parser *)parser)->protocolEncodingName) #define prologState (((Parser *)parser)->prologState) #define processor (((Parser *)parser)->processor) @@ -304,7 +311,7 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName) unparsedEntityDeclHandler = 0; notationDeclHandler = 0; externalEntityRefHandler = 0; - singleByteEncodingHandler = 0; + unknownEncodingHandler = 0; buffer = 0; bufferPtr = 0; bufferEnd = 0; @@ -328,7 +335,10 @@ XML_Parser XML_ParserCreate(const XML_Char *encodingName) groupSize = 0; groupConnector = 0; hadExternalDoctype = 0; - singleByteEncodingMem = 0; + unknownEncodingMem = 0; + unknownEncodingRelease = 0; + unknownEncodingData = 0; + unknownEncodingHandlerData = 0; poolInit(&tempPool); poolInit(&temp2Pool); protocolEncodingName = encodingName ? 
poolCopyString(&tempPool, encodingName) : 0; @@ -353,7 +363,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser, XML_CharacterDataHandler oldCharacterDataHandler = characterDataHandler; XML_ProcessingInstructionHandler oldProcessingInstructionHandler = processingInstructionHandler; XML_ExternalEntityRefHandler oldExternalEntityRefHandler = externalEntityRefHandler; - XML_SingleByteEncodingHandler oldSingleByteEncodingHandler = singleByteEncodingHandler; + XML_UnknownEncodingHandler oldUnknownEncodingHandler = unknownEncodingHandler; void *oldUserData = userData; parser = XML_ParserCreate(encodingName); @@ -364,7 +374,7 @@ XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser, characterDataHandler = oldCharacterDataHandler; processingInstructionHandler = oldProcessingInstructionHandler; externalEntityRefHandler = oldExternalEntityRefHandler; - singleByteEncodingHandler = oldSingleByteEncodingHandler; + unknownEncodingHandler = oldUnknownEncodingHandler; userData = oldUserData; if (!dtdCopy(&dtd, oldDtd) || !setOpenEntityNames(parser, openEntityNames)) { XML_ParserFree(parser); @@ -396,7 +406,9 @@ void XML_ParserFree(XML_Parser parser) free(groupConnector); free(buffer); free(dataBuf); - free(singleByteEncodingMem); + free(unknownEncodingMem); + if (unknownEncodingRelease) + unknownEncodingRelease(unknownEncodingData); free(parser); } @@ -461,10 +473,12 @@ void XML_SetExternalEntityRefHandler(XML_Parser parser, externalEntityRefHandler = handler; } -void XML_SetSingleByteEncodingHandler(XML_Parser parser, - XML_SingleByteEncodingHandler handler) +void XML_SetUnknownEncodingHandler(XML_Parser parser, + XML_UnknownEncodingHandler handler, + void *data) { - singleByteEncodingHandler = handler; + unknownEncodingHandler = handler; + unknownEncodingHandlerData = data; } int XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) @@ -1331,22 +1345,35 @@ processXmlDecl(XML_Parser parser, int isGeneralTextEntity, static enum XML_Error 
handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName) { - if (singleByteEncodingHandler) { - unsigned short table[256]; + if (unknownEncodingHandler) { + XML_Encoding info; int i; for (i = 0; i < 256; i++) - table[i] = 0; - if (singleByteEncodingHandler(userData, encodingName, table)) { + info.map[i] = 0; + info.convert = 0; + info.data = 0; + info.release = 0; + if (unknownEncodingHandler(unknownEncodingHandlerData, encodingName, &info)) { ENCODING *enc; - singleByteEncodingMem = malloc(XmlSizeOfSingleByteEncoding()); - if (!singleByteEncodingMem) + unknownEncodingMem = malloc(XmlSizeOfUnknownEncoding()); + if (!unknownEncodingMem) { + if (info.release) + info.release(info.data); return XML_ERROR_NO_MEMORY; - enc = XmlInitSingleByteEncoding(singleByteEncodingMem, table); + } + enc = XmlInitUnknownEncoding(unknownEncodingMem, + info.map, + info.convert, + info.data); if (enc) { + unknownEncodingData = info.data; + unknownEncodingRelease = info.release; encoding = enc; return XML_ERROR_NONE; } } + if (info.release) + info.release(info.data); } return XML_ERROR_UNKNOWN_ENCODING; } diff --git a/expat/xmlparse/xmlparse.h b/expat/xmlparse/xmlparse.h index 7d29c50a..62c27107 100755 --- a/expat/xmlparse/xmlparse.h +++ b/expat/xmlparse/xmlparse.h @@ -110,10 +110,19 @@ typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser, const XML_Char *systemId, const XML_Char *publicId); +typedef struct { + unsigned short map[256]; + void *data; + unsigned short (*convert)(void *data, const char *s); + void (*release)(void *data); +} XML_Encoding; -typedef int (*XML_SingleByteEncodingHandler)(void *userData, - const XML_Char *encoding, - unsigned short *table); +/* The encodingHandlerData passed to this call is that which was passed as the +encodingHandlerData argument to XML_SetUnknownEncodingHandler. 
*/ + +typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData, + const XML_Char *name, + XML_Encoding *info); void XMLPARSEAPI XML_SetElementHandler(XML_Parser parser, @@ -141,8 +150,9 @@ XML_SetExternalEntityRefHandler(XML_Parser parser, XML_ExternalEntityRefHandler handler); void XMLPARSEAPI -XML_SetSingleByteEncodingHandler(XML_Parser parser, - XML_SingleByteEncodingHandler handler); +XML_SetUnknownEncodingHandler(XML_Parser parser, + XML_UnknownEncodingHandler handler, + void *encodingHandlerData); /* This value is passed as the userData argument to callbacks. */ void XMLPARSEAPI diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index 1d63a7c3..94eaa109 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -78,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0) +static +int isNever(const ENCODING *enc, const char *p) +{ + return 0; +} + +static +int utf8_isName2(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); +} + +static +int utf8_isName3(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); +} + +#define utf8_isName4 isNever + +static +int utf8_isNmstrt2(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); /* name-start table, as in utf8_isNmstrt3 */ +} + +static +int utf8_isNmstrt3(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); +} + +#define utf8_isNmstrt4 isNever + +#define utf8_isInvalid2 isNever + +static +int utf8_isInvalid3(const ENCODING *enc, const char *p) +{ + return UTF8_INVALID3((const unsigned char *)p); +} + +static +int utf8_isInvalid4(const ENCODING *enc, const char *p) +{ + return UTF8_INVALID4((const unsigned char *)p); +} + struct normal_encoding { ENCODING enc; unsigned char type[256]; + int (*isName2)(const ENCODING *, const char *); + int 
(*isName3)(const ENCODING *, const char *); + int (*isName4)(const ENCODING *, const char *); + int (*isNmstrt2)(const ENCODING *, const char *); + int (*isNmstrt3)(const ENCODING *, const char *); + int (*isNmstrt4)(const ENCODING *, const char *); + int (*isInvalid2)(const ENCODING *, const char *); + int (*isInvalid3)(const ENCODING *, const char *); + int (*isInvalid4)(const ENCODING *, const char *); }; +#define NORMAL_VTABLE(E) \ + E ## isName2, \ + E ## isName3, \ + E ## isName4, \ + E ## isNmstrt2, \ + E ## isNmstrt3, \ + E ## isNmstrt4, \ + E ## isInvalid2, \ + E ## isInvalid3, \ + E ## isInvalid4 + static int checkCharRefNumber(int); #include "xmltok_impl.h" @@ -92,12 +160,16 @@ static int checkCharRefNumber(int); #define BYTE_TYPE(enc, p) \ (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) #define BYTE_TO_ASCII(enc, p) (*p) -#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n) -#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n) + +#define IS_NAME_CHAR(enc, p, n) \ + (((const struct normal_encoding *)(enc))->isName ## n(enc, p)) +#define IS_NMSTRT_CHAR(enc, p, n) \ + (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p)) #define IS_INVALID_CHAR(enc, p, n) \ -((n) == 3 \ - ? UTF8_INVALID3((const unsigned char *)(p)) \ - : ((n) == 4 ? 
UTF8_INVALID4((const unsigned char *)(p)) : 0)) + (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p)) + +#define IS_NAME_CHAR_MINBPC(enc, p) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) /* c is an ASCII character */ #define CHAR_MATCHES(enc, p, c) (*(p) == c) @@ -110,7 +182,9 @@ static int checkCharRefNumber(int); #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ @@ -183,7 +257,8 @@ static const struct normal_encoding utf8_encoding = { { #include "asciitab.h" #include "utf8tab.h" - } + }, + NORMAL_VTABLE(utf8_) }; static const struct normal_encoding internal_utf8_encoding = { @@ -191,7 +266,8 @@ static const struct normal_encoding internal_utf8_encoding = { { #include "iasciitab.h" #include "utf8tab.h" - } + }, + NORMAL_VTABLE(utf8_) }; static @@ -358,9 +434,11 @@ void PREFIX(toUtf16)(const ENCODING *enc, \ : unicode_byte_type((p)[1], (p)[0])) #define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? 
(p)[0] : -1) #define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) -#define IS_NAME_CHAR(enc, p, n) \ +#define IS_NAME_CHAR(enc, p, n) (0) +#define IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) -#define IS_NMSTRT_CHAR(enc, p, n) \ +#define IS_NMSTRT_CHAR(enc, p, n) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) #include "xmltok_impl.c" @@ -381,7 +459,9 @@ DEFINE_UTF16_TO_UTF16 #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR static const struct normal_encoding little2_encoding = { @@ -417,9 +497,11 @@ static const struct normal_encoding internal_little2_encoding = { : unicode_byte_type((p)[0], (p)[1])) #define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) #define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) -#define IS_NAME_CHAR(enc, p, n) \ +#define IS_NAME_CHAR(enc, p, n) 0 +#define IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) -#define IS_NMSTRT_CHAR(enc, p, n) \ +#define IS_NMSTRT_CHAR(enc, p, n) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) #include "xmltok_impl.c" @@ -440,7 +522,9 @@ DEFINE_UTF16_TO_UTF16 #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR static const struct normal_encoding big2_encoding = { @@ -876,51 +960,105 @@ int XmlUtf16Encode(int charNum, unsigned short *buf) return 0; } -struct single_encoding { +struct unknown_encoding { struct normal_encoding normal; + unsigned short (*convert)(void *userData, const char *p); + void *userData; unsigned short utf16[256]; unsigned char utf8[256][4]; }; -int XmlSizeOfSingleByteEncoding() +int XmlSizeOfUnknownEncoding() 
{ - return sizeof(struct single_encoding); + return sizeof(struct unknown_encoding); } static -void single_toUtf8(const ENCODING *enc, - const char **fromP, const char *fromLim, - char **toP, const char *toLim) +int unknown_isName(const ENCODING *enc, const char *p) { + unsigned short c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p); + return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); +} + +static +int unknown_isNmstrt(const ENCODING *enc, const char *p) +{ + unsigned short c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p); + return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); +} + +static +int unknown_isInvalid(const ENCODING *enc, const char *p) +{ + return ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p) == 0; +} + +static +void unknown_toUtf8(const ENCODING *enc, + const char **fromP, const char *fromLim, + char **toP, const char *toLim) +{ + char buf[XML_UTF8_ENCODE_MAX]; for (;;) { const unsigned char *utf8; int n; if (*fromP == fromLim) break; - utf8 = ((const struct single_encoding *)enc)->utf8[(unsigned char)**fromP]; + utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; n = *utf8++; - if (n > toLim - *toP) - break; + if (n == 0) { + unsigned short c + = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); + n = XmlUtf8Encode(c, buf); + if (n > toLim - *toP) + break; + utf8 = buf; + *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] + - (BT_LEAD2 - 2); + } + else { + if (n > toLim - *toP) + break; + (*fromP)++; + } do { *(*toP)++ = *utf8++; } while (--n != 0); - (*fromP)++; } } static -void single_toUtf16(const ENCODING *enc, - const char **fromP, const char *fromLim, - unsigned short **toP, const unsigned short *toLim) +void unknown_toUtf16(const ENCODING *enc, + const char 
**fromP, const char *fromLim, + unsigned short **toP, const unsigned short *toLim) { - while (*fromP != fromLim && *toP != toLim) - *(*toP)++ = ((const struct single_encoding *)enc)->utf16[(unsigned char)*(*fromP)++]; + while (*fromP != fromLim && *toP != toLim) { + unsigned short c + = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; + if (c == 0) { + c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); + *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] + - (BT_LEAD2 - 2); + } + else + (*fromP)++; + *(*toP)++ = c; + } } -ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table) +ENCODING * +XmlInitUnknownEncoding(void *mem, + unsigned short *table, + unsigned short (*convert)(void *userData, const char *p), + void *userData) { int i; - struct single_encoding *e = mem; + struct unknown_encoding *e = mem; for (i = 0; i < sizeof(struct normal_encoding); i++) ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; for (i = 0; i < 128; i++) @@ -935,12 +1073,24 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table) && latin1_encoding.type[c] != BT_NONXML && c != i) return 0; - e->normal.type[i] = latin1_encoding.type[c]; - e->utf8[i][0] = 1; - e->utf8[i][1] = (char)c; + if (c >= 2 && c <= 4) { + e->normal.type[i] = BT_LEAD2 + (c - 2); + e->utf8[i][0] = 0; + e->utf16[i] = 0; + } + else { + e->normal.type[i] = latin1_encoding.type[c]; + e->utf8[i][0] = 1; + e->utf8[i][1] = (char)c; + e->utf16[i] = c == 0 ? 
0xFFFF : c; + } } - else if (checkCharRefNumber(c) < 0) + else if (checkCharRefNumber(c) < 0) { e->normal.type[i] = BT_NONXML; + e->utf16[i] = 0xFFFF; + e->utf8[i][0] = 1; + e->utf8[i][1] = 0; + } else { if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) e->normal.type[i] = BT_NMSTRT; @@ -949,10 +1099,23 @@ ENCODING *XmlInitSingleByteEncoding(void *mem, unsigned short *table) else e->normal.type[i] = BT_OTHER; e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); + e->utf16[i] = c; } - e->utf16[i] = c; } - e->normal.enc.utf8Convert = single_toUtf8; - e->normal.enc.utf16Convert = single_toUtf16; + e->userData = userData; + e->convert = convert; + if (convert) { + e->normal.isName2 = unknown_isName; + e->normal.isName3 = unknown_isName; + e->normal.isName4 = unknown_isName; + e->normal.isNmstrt2 = unknown_isNmstrt; + e->normal.isNmstrt3 = unknown_isNmstrt; + e->normal.isNmstrt4 = unknown_isNmstrt; + e->normal.isInvalid2 = unknown_isInvalid; + e->normal.isInvalid3 = unknown_isInvalid; + e->normal.isInvalid4 = unknown_isInvalid; + } + e->normal.enc.utf8Convert = unknown_toUtf8; + e->normal.enc.utf16Convert = unknown_toUtf16; return &(e->normal.enc); } diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h index f421a1ca..f28414a9 100755 --- a/expat/xmltok/xmltok.h +++ b/expat/xmltok/xmltok.h @@ -262,8 +262,12 @@ const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding(); int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf); int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf); -int XMLTOKAPI XmlSizeOfSingleByteEncoding(); -ENCODING XMLTOKAPI *XmlInitSingleByteEncoding(void *mem, unsigned short *table); +int XMLTOKAPI XmlSizeOfUnknownEncoding(); +ENCODING XMLTOKAPI * +XmlInitUnknownEncoding(void *mem, + unsigned short *table, + unsigned short (*convert)(void *userData, const char *p), + void *userData); #ifdef __cplusplus } diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c index aac95624..513935ae 100755 --- 
a/expat/xmltok/xmltok_impl.c +++ b/expat/xmltok/xmltok_impl.c @@ -56,7 +56,7 @@ Contributor(s): #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ case BT_NONASCII: \ - if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \ + if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -84,7 +84,7 @@ Contributor(s): #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ case BT_NONASCII: \ - if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \ + if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -1082,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, ptr += MINBPC; break; case BT_NONASCII: - if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { + if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { ptr += MINBPC; tok = XML_TOK_NAME; break; } - if (IS_NAME_CHAR(enc, ptr, MINBPC)) { + if (IS_NAME_CHAR_MINBPC(enc, ptr)) { ptr += MINBPC; tok = XML_TOK_NMTOKEN; break; diff --git a/expat/xmlwf/codepage.c b/expat/xmlwf/codepage.c index 07cd232a..8144e954 100755 --- a/expat/xmlwf/codepage.c +++ b/expat/xmlwf/codepage.c @@ -23,24 +23,52 @@ Contributor(s): #ifdef WIN32 #include <windows.h> -int codepage(int cp, unsigned short *map) +int codepageMap(int cp, unsigned short *map) { int i; CPINFO info; - if (!GetCPInfo(cp, &info) || info.MaxCharSize > 1) + if (!GetCPInfo(cp, &info) || info.MaxCharSize > 2) return 0; + for (i = 0; i < 256; i++) + map[i] = 0; + if (info.MaxCharSize > 1) { + for (i = 0; i < MAX_LEADBYTES; i += 2) { /* LeadByte holds (first, last) pairs ended by a 0,0 pair */ + int j, lim; + if (info.LeadByte[i] == 0 && info.LeadByte[i + 1] == 0) + break; + lim = info.LeadByte[i + 1]; + for (j = info.LeadByte[i]; j <= lim; j++) /* the last byte of a range is itself a lead byte */ + map[j] = 2; + } + } for (i = 0; i < 256; i++) { - char c = i; - if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, - &c, 1, map + i, 1) == 0) - map[i] = 0; + if (map[i] == 0) { + char c = i; + if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, + &c, 1, map + i, 1) == 0) + map[i] = 0; + } } return 1; } +unsigned short 
codepageConvert(int cp, const char *p) +{ + unsigned short c; + if (MultiByteToWideChar(cp, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, + p, 2, &c, 1) == 1) + return c; + return 0; +} + #else /* not WIN32 */ -int codepage(int cp, unsigned short *map) +int codepageMap(int cp, unsigned short *map) +{ + return 0; +} + +unsigned short codepageConvert(int cp, const char *p) { return 0; } diff --git a/expat/xmlwf/codepage.h b/expat/xmlwf/codepage.h index 3f10cc52..a8563276 100755 --- a/expat/xmlwf/codepage.h +++ b/expat/xmlwf/codepage.h @@ -18,4 +18,5 @@ James Clark. All Rights Reserved. Contributor(s): */ -int codepage(int cp, unsigned short *map); +int codepageMap(int cp, unsigned short *map); +unsigned short codepageConvert(int cp, const char *p); diff --git a/expat/xmlwf/xmlwf.c b/expat/xmlwf/xmlwf.c index c157a386..bb0d43d3 100755 --- a/expat/xmlwf/xmlwf.c +++ b/expat/xmlwf/xmlwf.c @@ -369,9 +369,15 @@ int externalEntityRefStream(XML_Parser parser, } static -int singleByteEncoding(void *userData, - const XML_Char *encoding, - unsigned short *table) +unsigned short unknownEncodingConvert(void *data, const char *p) +{ + return codepageConvert(*(int *)data, p); +} + +static +int unknownEncoding(void *userData, + const XML_Char *name, + XML_Encoding *info) { int cp; static const XML_Char prefixL[] = T("windows-"); @@ -379,13 +385,13 @@ int singleByteEncoding(void *userData, int i; for (i = 0; prefixU[i]; i++) - if (encoding[i] != prefixU[i] && encoding[i] != prefixL[i]) + if (name[i] != prefixU[i] && name[i] != prefixL[i]) return 0; cp = 0; - for (; encoding[i]; i++) { + for (; name[i]; i++) { static const XML_Char digits[] = T("0123456789"); - const XML_Char *s = tcschr(digits, encoding[i]); + const XML_Char *s = tcschr(digits, name[i]); if (!s) return 0; cp *= 10; @@ -393,7 +399,17 @@ int singleByteEncoding(void *userData, if (cp >= 0x10000) return 0; } - return codepage(cp, table); + if (!codepageMap(cp, info->map)) + return 0; + info->convert = unknownEncodingConvert; + /* 
We could just cast the code page integer to a void *, + and avoid the use of release. */ + info->release = free; + info->data = malloc(sizeof(int)); + if (!info->data) + return 0; + *(int *)info->data = cp; + return 1; } static @@ -498,7 +514,7 @@ int tmain(int argc, XML_Char **argv) #endif } if (windowsCodePages) - XML_SetSingleByteEncodingHandler(parser, singleByteEncoding); + XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0); if (processExternalEntities) { if (!XML_SetBase(parser, argv[i])) { ftprintf(stderr, T("%s: out of memory"), argv[0]);