From 583f937102f5f95e41dbd0e6f4ffb9d3323db3aa Mon Sep 17 00:00:00 2001 From: James Clark Date: Sat, 30 May 1998 10:13:07 +0000 Subject: [PATCH] Change handling of internal encodings in xmltok interface. --- expat/xmlparse/xmlparse.c | 26 ++--- expat/xmltok/xmltok.c | 214 +++++++++++++++++++------------------- expat/xmltok/xmltok.h | 40 +++---- 3 files changed, 142 insertions(+), 138 deletions(-) diff --git a/expat/xmlparse/xmlparse.c b/expat/xmlparse/xmlparse.c index b26a73dc..6b603444 100755 --- a/expat/xmlparse/xmlparse.c +++ b/expat/xmlparse/xmlparse.c @@ -673,7 +673,7 @@ doContent(XML_Parser parser, const char *end, const char **nextPtr) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *utf8 = XmlGetUtf8InternalEncoding(); for (;;) { const char *next; int tok = XmlContentTok(enc, s, end, &next); @@ -839,9 +839,9 @@ doContent(XML_Parser parser, if (nextPtr) toPtr += tag->rawNameLength; tag->name = toPtr; - XmlConvert(enc, XML_UTF8_ENCODING, - &fromPtr, rawNameEnd, - &toPtr, tag->bufEnd - 1); + XmlUtf8Convert(enc, + &fromPtr, rawNameEnd, + &toPtr, tag->bufEnd - 1); if (fromPtr == rawNameEnd) break; bufSize = (tag->bufEnd - tag->buf) << 1; @@ -937,7 +937,7 @@ doContent(XML_Parser parser, } if (characterDataHandler) { char buf[XML_MAX_BYTES_PER_CHAR]; - characterDataHandler(userData, buf, XmlEncode(utf8, n, buf)); + characterDataHandler(userData, buf, XmlUtf8Encode(n, buf)); } } break; @@ -966,7 +966,7 @@ doContent(XML_Parser parser, } if (characterDataHandler) { char *dataPtr = dataBuf; - XmlConvert(enc, XML_UTF8_ENCODING, &s, end, &dataPtr, dataBufEnd); + XmlUtf8Convert(enc, &s, end, &dataPtr, dataBufEnd); characterDataHandler(userData, dataBuf, dataPtr - dataBuf); } if (startTagLevel == 0) { @@ -982,7 +982,7 @@ doContent(XML_Parser parser, if (characterDataHandler) { do { char *dataPtr = dataBuf; - XmlConvert(enc, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd); + XmlUtf8Convert(enc, &s, next, &dataPtr, dataBufEnd); characterDataHandler(userData, dataBuf, dataPtr - dataBuf); } while (s != next); } @@ -1134,7 +1134,7 @@ enum XML_Error doCdataSection(XML_Parser parser, if (characterDataHandler) { do { char *dataPtr = dataBuf; - XmlConvert(encoding, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd); + XmlUtf8Convert(encoding, &s, next, &dataPtr, dataBufEnd); characterDataHandler(userData, dataBuf, dataPtr - dataBuf); } while (s != next); } @@ -1484,7 +1484,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, const char *ptr, const char *end, STRING_POOL *pool) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *utf8 = XmlGetUtf8InternalEncoding(); for (;;) { const char *next; int tok = XmlAttributeValueTok(enc, ptr, end, &next); @@ -1510,7 +1510,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, && n == ' ' && (poolLength(pool) == 0 || poolLastByte(pool) == ' ')) break; - n = XmlEncode(utf8, n, buf); + n = XmlUtf8Encode(n, buf); if (!n) { errorPtr = ptr; return XML_ERROR_BAD_CHAR_REF; @@ -1596,7 +1596,7 @@ enum XML_Error storeEntityValue(XML_Parser parser, const char *entityTextPtr, const char *entityTextEnd) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *utf8 = XmlGetUtf8InternalEncoding(); STRING_POOL *pool = &(dtd.pool); entityTextPtr += encoding->minBytesPerChar; entityTextEnd -= encoding->minBytesPerChar; @@ -1638,7 +1638,7 @@ enum XML_Error storeEntityValue(XML_Parser parser, errorPtr = entityTextPtr; return XML_ERROR_BAD_CHAR_REF; } - n = XmlEncode(utf8, n, buf); + n = XmlUtf8Encode(n, buf); if (!n) { errorPtr = entityTextPtr; return XML_ERROR_BAD_CHAR_REF; @@ -2046,7 +2046,7 @@ char *poolAppend(STRING_POOL *pool, const ENCODING *enc, if (!pool->ptr && !poolGrow(pool)) return 0; for (;;) { - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &(pool->ptr), pool->end); + XmlUtf8Convert(enc, &ptr, end, &(pool->ptr), pool->end); if (ptr == end) break; if (!poolGrow(pool)) diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index da556ffb..e359c01c 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -34,11 +34,7 @@ Contributor(s): PREFIX(updatePosition), \ PREFIX(isPublicId) -#define VTABLE2 \ - PREFIX(encode), \ - { PREFIX(toUtf8) } - -#define VTABLE VTABLE1, VTABLE2 +#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) #define UCS2_GET_NAMING(pages, hi, lo) \ (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) @@ -116,48 +112,13 @@ static int checkCharRefNumber(int); #undef IS_NMSTRT_CHAR #undef IS_INVALID_CHAR -enum { - /* cvalN is value of masked first byte of N byte sequence */ - cval1 = 0x00, - cval2 = 0xc0, - cval3 = 0xe0, - cval4 = 0xf0, - /* minN is minimum legal resulting value for N byte sequence */ - min2 = 0x80, - min3 = 0x800, - min4 = 0x10000 +enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ + UTF8_cval1 = 0x00, + UTF8_cval2 = 0xc0, + UTF8_cval3 = 0xe0, + UTF8_cval4 = 0xf0 }; -static -int utf8_encode(const ENCODING *enc, int c, char *buf) -{ - if (c < 0) - return 0; - if (c < min2) { - buf[0] = (c | cval1); - return 1; - } - if (c < min3) { - buf[0] = ((c >> 6) | cval2); - buf[1] = ((c & 0x3f) | 0x80); - return 2; - } - if (c < min4) { - buf[0] = ((c >> 12) | cval3); - buf[1] = (((c >> 6) & 0x3f) | 0x80); - buf[2] = ((c & 0x3f) | 0x80); - return 3; - } - if (c < 0x110000) { - buf[0] = ((c >> 18) | cval4); - buf[1] = (((c >> 12) & 0x3f) | 0x80); - buf[2] = (((c >> 6) & 0x3f) | 0x80); - buf[3] = ((c & 0x3f) | 0x80); - return 4; - } - return 0; -} - static void utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, @@ -177,8 +138,16 @@ void utf8_toUtf8(const ENCODING *enc, *toP = to; } +static +void utf8_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + UTF16_CHAR **toP, const UTF16_CHAR *toLim) +{ + /* FIXME */ +} + static const struct normal_encoding utf8_encoding = { - { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 }, + { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1 }, { #include "asciitab.h" #include "utf8tab.h" @@ -186,25 +155,13 @@ static const struct normal_encoding utf8_encoding = { }; static const struct normal_encoding internal_utf8_encoding = { - { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 }, + { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1 }, { #include "iasciitab.h" #include "utf8tab.h" } }; -static -int latin1_encode(const ENCODING *enc, int c, char *buf) -{ - if (c < 0) - return 0; - if (c <= 0xFF) { - buf[0] = (char)c; - return 1; - } - return 0; -} - static void latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, @@ -218,7 +175,7 @@ void latin1_toUtf8(const ENCODING *enc, if (c & 0x80) { if (toLim - *toP < 2) break; - *(*toP)++ = ((c >> 6) | cval2); + *(*toP)++ = ((c >> 6) | UTF8_cval2); *(*toP)++ = ((c & 0x3f) | 0x80); (*fromP)++; } @@ -230,8 +187,15 @@ void latin1_toUtf8(const ENCODING *enc, } } +static +void latin1_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + UTF16_CHAR **toP, const UTF16_CHAR *toLim) +{ +} + static const struct normal_encoding latin1_encoding = { - { VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 }, + { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1 }, { #include "asciitab.h" #include "latin1tab.h" @@ -240,18 +204,6 @@ static const struct normal_encoding latin1_encoding = { #define latin1tab (latin1_encoding.type) -static -int ascii_encode(const ENCODING *enc, int c, char *buf) -{ - if (c < 0) - return 0; - if (c <= 0x7F) { - buf[0] = (char)c; - return 1; - } - return 0; -} - static void ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, @@ -261,8 +213,16 @@ void ascii_toUtf8(const ENCODING *enc, *(*toP)++ = *(*fromP)++; } +static +void ascii_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + UTF16_CHAR **toP, const UTF16_CHAR *toLim) +{ + /* FIXME */ +} + static const struct normal_encoding ascii_encoding = { - { VTABLE1, ascii_encode, { ascii_toUtf8 }, 1 }, + { VTABLE1, ascii_toUtf8, ascii_toUtf16, 1 }, { #include "asciitab.h" /* BT_NONXML == 0 */ @@ -289,25 +249,6 @@ static int unicode_byte_type(char hi, char lo) return BT_NONASCII; } -#define DEFINE_UTF16_ENCODE \ -static \ -int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \ -{ \ - if (charNum < 0) \ - return 0; \ - if (charNum < 0x10000) { \ - SET2(buf, charNum); \ - return 2; \ - } \ - if (charNum < 0x110000) { \ - charNum -= 0x10000; \ - SET2(buf, (charNum >> 10) + 0xD800); \ - SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \ - return 4; \ - } \ - return 0; \ -} - #define DEFINE_UTF16_TO_UTF8 \ static \ void PREFIX(toUtf8)(const ENCODING *enc, \ @@ -337,7 +278,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ *fromP = from; \ return; \ } \ - *(*toP)++ = ((lo >> 6) | (hi << 2) | cval2); \ + *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ default: \ @@ -346,7 +287,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ return; \ } \ /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ - *(*toP)++ = ((hi >> 4) | cval3); \ + *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ @@ -356,7 +297,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ return; \ } \ plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ - *(*toP)++ = ((plane >> 2) | cval4); \ + *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ from += 2; \ lo2 = GET_LO(from); \ @@ -371,6 +312,15 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ *fromP = from; \ } +#define DEFINE_UTF16_TO_UTF16 \ +static \ +void PREFIX(toUtf16)(const ENCODING *enc, \ + const char **fromP, const char *fromLim, \ + UTF16_CHAR **toP, const UTF16_CHAR *toLim) \ +{ \ + /* FIXME */ \ +} + #define PREFIX(ident) little2_ ## ident #define MINBPC 2 #define BYTE_TYPE(enc, p) \ @@ -389,8 +339,8 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ #define GET_LO(ptr) ((unsigned char)(ptr)[0]) #define GET_HI(ptr) ((unsigned char)(ptr)[1]) -DEFINE_UTF16_ENCODE DEFINE_UTF16_TO_UTF8 +DEFINE_UTF16_TO_UTF16 #undef SET2 #undef GET_LO @@ -426,8 +376,8 @@ static const struct encoding little2_encoding = { VTABLE, 2 }; #define GET_LO(ptr) ((unsigned char)(ptr)[1]) #define GET_HI(ptr) ((unsigned char)(ptr)[0]) -DEFINE_UTF16_ENCODE DEFINE_UTF16_TO_UTF8 +DEFINE_UTF16_TO_UTF16 #undef SET2 #undef GET_LO @@ -523,12 +473,14 @@ void initUpdatePosition(const ENCODING *enc, const char *ptr, normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); } -const ENCODING *XmlGetInternalEncoding(int e) +const ENCODING *XmlGetUtf8InternalEncoding() { - switch (e) { - case XML_UTF8_ENCODING: - return &internal_utf8_encoding.enc; - } + return &internal_utf8_encoding.enc; +} + +const ENCODING *XmlGetUtf16InternalEncoding() +{ + /* FIXME */ return 0; } @@ -564,7 +516,7 @@ int toAscii(const ENCODING *enc, const char *ptr, const char *end) { char buf[1]; char *p = buf; - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1); + XmlUtf8Convert(enc, &ptr, end, &p, p + 1); if (p == buf) return -1; else @@ -674,7 +626,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e char buf[ENCODING_MAX]; char *p = buf; int i; - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1); + XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1); if (ptr != end) return 0; *p = 0; @@ -792,3 +744,55 @@ int checkCharRefNumber(int result) return result; } +int XmlUtf8Encode(int c, char *buf) +{ + enum { + /* minN is minimum legal resulting value for N byte sequence */ + min2 = 0x80, + min3 = 0x800, + min4 = 0x10000 + }; + + if (c < 0) + return 0; + if (c < min2) { + buf[0] = (c | UTF8_cval1); + return 1; + } + if (c < min3) { + buf[0] = ((c >> 6) | UTF8_cval2); + buf[1] = ((c & 0x3f) | 0x80); + return 2; + } + if (c < min4) { + buf[0] = ((c >> 12) | UTF8_cval3); + buf[1] = (((c >> 6) & 0x3f) | 0x80); + buf[2] = ((c & 0x3f) | 0x80); + return 3; + } + if (c < 0x110000) { + buf[0] = ((c >> 18) | UTF8_cval4); + buf[1] = (((c >> 12) & 0x3f) | 0x80); + buf[2] = (((c >> 6) & 0x3f) | 0x80); + buf[3] = ((c & 0x3f) | 0x80); + return 4; + } + return 0; +} + +int XmlUtf16Encode(int charNum, UTF16_CHAR *buf) +{ + if (charNum < 0) + return 0; + if (charNum < 0x10000) { + buf[0] = charNum; + return 1; + } + if (charNum < 0x110000) { + charNum -= 0x10000; + buf[0] = (charNum >> 10) + 0xD800; + buf[1] = (charNum & 0x3FF) + 0xDC00; + return 2; + } + return 0; +} diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h index 61cd8286..98320c13 100755 --- a/expat/xmltok/xmltok.h +++ b/expat/xmltok/xmltok.h @@ -103,13 +103,6 @@ extern "C" { #define XML_ATTRIBUTE_VALUE_LITERAL 0 #define XML_ENTITY_VALUE_LITERAL 1 -#define XML_N_INTERNAL_ENCODINGS 1 -#define XML_UTF8_ENCODING 0 -#if 0 -#define XML_UTF16_ENCODING 1 -#define XML_UCS4_ENCODING 2 -#endif - #define XML_MAX_BYTES_PER_CHAR 4 typedef struct position { @@ -128,6 +121,8 @@ typedef struct { struct encoding; typedef struct encoding ENCODING; +typedef unsigned short UTF16_CHAR; + struct encoding { int (*scanners[XML_N_STATES])(const ENCODING *, const char *, @@ -152,14 +147,16 @@ struct encoding { POSITION *); int (*isPublicId)(const ENCODING *enc, const char *ptr, const char *end, const char **badPtr); - int (*encode)(const ENCODING *enc, - int charNum, - char *buf); - void (*convert[XML_N_INTERNAL_ENCODINGS])(const ENCODING *enc, - const char **fromP, - const char *fromLim, - char **toP, - const char *toLim); + void (*utf8Convert)(const ENCODING *enc, + const char **fromP, + const char *fromLim, + char **toP, + const char *toLim); + void (*utf16Convert)(const ENCODING *enc, + const char **fromP, + const char *fromLim, + UTF16_CHAR **toP, + const UTF16_CHAR *toLim); int minBytesPerChar; }; @@ -231,11 +228,11 @@ the content of a literal that has already been returned by XmlTok. */ #define XmlIsPublicId(enc, ptr, end, badPtr) \ (((enc)->isPublicId)(enc, ptr, end, badPtr)) -#define XmlEncode(enc, ch, buf) \ - (((enc)->encode)(enc, ch, buf)) +#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \ + (((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim)) -#define XmlConvert(enc, targetEnc, fromP, fromLim, toP, toLim) \ - (((enc)->convert[targetEnc])(enc, fromP, fromLim, toP, toLim)) +#define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \ + (((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim)) typedef struct { ENCODING initEnc; @@ -253,7 +250,10 @@ int XMLTOKAPI XmlParseXmlDecl(int isGeneralTextEntity, int *standalonePtr); int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name); -const ENCODING XMLTOKAPI *XmlGetInternalEncoding(int); +const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding(); +const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding(); +int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf); +int XMLTOKAPI XmlUtf16Encode(int charNumber, UTF16_CHAR *buf); #ifdef __cplusplus }