Rewrite for proper multibyte support and more well-formedness checking.

This commit is contained in:
James Clark 1997-11-10 06:10:50 +00:00
parent 7834fbbf8a
commit 9ecf532908
4 changed files with 293 additions and 431 deletions

View file

@ -54,7 +54,6 @@ INTDIR=.\Release
ALL : "$(OUTDIR)\xmltok.dll"
CLEAN :
-@erase "$(INTDIR)\wxmltok.obj"
-@erase "$(INTDIR)\xmltok.obj"
-@erase "$(OUTDIR)\xmltok.dll"
-@erase "$(OUTDIR)\xmltok.exp"
@ -112,7 +111,6 @@ LINK32_FLAGS=kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib\
/pdb:"$(OUTDIR)/xmltok.pdb" /machine:I386 /out:"$(OUTDIR)/xmltok.dll"\
/implib:"$(OUTDIR)/xmltok.lib"
LINK32_OBJS= \
"$(INTDIR)\wxmltok.obj" \
"$(INTDIR)\xmltok.obj"
"$(OUTDIR)\xmltok.dll" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
@ -140,7 +138,6 @@ ALL : "$(OUTDIR)\xmltok.dll"
CLEAN :
-@erase "$(INTDIR)\vc40.idb"
-@erase "$(INTDIR)\vc40.pdb"
-@erase "$(INTDIR)\wxmltok.obj"
-@erase "$(INTDIR)\xmltok.obj"
-@erase "$(OUTDIR)\xmltok.dll"
-@erase "$(OUTDIR)\xmltok.exp"
@ -199,7 +196,6 @@ LINK32_FLAGS=kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib\
/pdb:"$(OUTDIR)/xmltok.pdb" /debug /machine:I386 /out:"$(OUTDIR)/xmltok.dll"\
/implib:"$(OUTDIR)/xmltok.lib"
LINK32_OBJS= \
"$(INTDIR)\wxmltok.obj" \
"$(INTDIR)\xmltok.obj"
"$(OUTDIR)\xmltok.dll" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
@ -380,37 +376,40 @@ LINK32_OBJS= \
# Begin Source File
SOURCE=.\xmltok.c
DEP_CPP_XMLTO=\
".\xmltok.h"\
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
# End Source File
################################################################################
# Begin Source File
SOURCE=.\wxmltok.c
!IF "$(CFG)" == "xmltok - Win32 Release"
DEP_CPP_WXMLT=\
".\xmltok.c"\
DEP_CPP_XMLTO=\
".\asciitab.h"\
".\latin1tab.h"\
".\nametab.h"\
".\utf8tab.h"\
".\xmltok.h"\
".\xmltok_impl.c"\
".\xmltok_impl.h"\
# ADD CPP /Ob2
"$(INTDIR)\wxmltok.obj" : $(SOURCE) $(DEP_CPP_WXMLT) "$(INTDIR)" ".\xmltok.c"
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
$(CPP) /nologo /MT /W3 /GX /O2 /Ob2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS"\
/Fp"$(INTDIR)/xmltok.pch" /YX /Fo"$(INTDIR)/" /c $(SOURCE)
!ELSEIF "$(CFG)" == "xmltok - Win32 Debug"
DEP_CPP_WXMLT=\
".\xmltok.c"\
DEP_CPP_XMLTO=\
".\asciitab.h"\
".\latin1tab.h"\
".\nametab.h"\
".\utf8tab.h"\
".\xmltok.h"\
".\xmltok_impl.c"\
".\xmltok_impl.h"\
"$(INTDIR)\wxmltok.obj" : $(SOURCE) $(DEP_CPP_WXMLT) "$(INTDIR)" ".\xmltok.c"
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
$(CPP) /nologo /MTd /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS"\
/Fp"$(INTDIR)/xmltok.pch" /YX /Fo"$(INTDIR)/" /Fd"$(INTDIR)/" /c $(SOURCE)
!ENDIF
@ -451,6 +450,9 @@ DEP_CPP_WXMLT=\
# Begin Source File
SOURCE=.\xmlec\xmlec.c
!IF "$(CFG)" == "xmlec - Win32 Release"
DEP_CPP_XMLEC=\
".\xmltok.h"\
@ -459,6 +461,18 @@ DEP_CPP_XMLEC=\
$(CPP) $(CPP_PROJ) $(SOURCE)
!ELSEIF "$(CFG)" == "xmlec - Win32 Debug"
DEP_CPP_XMLEC=\
".\xmltok.h"\
"$(INTDIR)\xmlec.obj" : $(SOURCE) $(DEP_CPP_XMLEC) "$(INTDIR)"
$(CPP) $(CPP_PROJ) $(SOURCE)
!ENDIF
# End Source File
# End Target
# End Project

View file

@ -11,7 +11,8 @@
#include <windows.h>
static
int XmlSkipProlog(const char **s, const char *end, const char **nextTokP);
int XmlSkipProlog(const char **s, const char *end, const char **nextTokP,
const ENCODING **enc);
int XmlParse(const char *s, size_t n, const char *filename)
{
@ -19,7 +20,8 @@ int XmlParse(const char *s, size_t n, const char *filename)
const char *start = s;
const char *end = s + n;
const char *next;
int tok = XmlSkipProlog(&s, end, &next);
const ENCODING *enc;
int tok = XmlSkipProlog(&s, end, &next, &enc);
for (;;) {
switch (tok) {
case XML_TOK_NONE:
@ -31,12 +33,15 @@ int XmlParse(const char *s, size_t n, const char *filename)
return 1;
case XML_TOK_INVALID:
fprintf(stderr, "%s: well-formedness error at byte %lu\n",
filename, (unsigned long)(s - start));
filename, (unsigned long)(next - start));
return 0;
case XML_TOK_PARTIAL:
fprintf(stderr, "%s: unclosed token started at byte %lu\n",
filename, (unsigned long)(s - start));
return 0;
case XML_TOK_PARTIAL_CHAR:
fprintf(stderr, "%s: malformed input\n", filename);
return 0;
case XML_TOK_COMMENT:
break;
case XML_TOK_START_TAG:
@ -46,22 +51,25 @@ int XmlParse(const char *s, size_t n, const char *filename)
break;
}
s = next;
tok = XmlContentTok(s, end, &next);
tok = XmlContentTok(enc, s, end, &next);
}
/* not reached */
}
static
int XmlSkipProlog(const char **startp, const char *end, const char **nextTokP)
int XmlSkipProlog(const char **startp, const char *end,
const char **nextTokP, const ENCODING **enc)
{
const char *s = *startp;
INIT_ENCODING initEnc;
XmlInitEncoding(&initEnc, enc);
for (;;) {
int tok = XmlPrologTok(s, end, nextTokP);
int tok = XmlPrologTok(*enc, s, end, nextTokP);
switch (tok) {
case XML_TOK_NONE:
case XML_TOK_INVALID:
case XML_TOK_PARTIAL:
case XML_TOK_START_TAG:
case XML_TOK_INVALID:
case XML_TOK_NONE:
case XML_TOK_PARTIAL:
*startp = s;
return tok;
default:
@ -122,9 +130,11 @@ struct XmlTokBuffer {
char *ptr;
size_t size;
int fd;
int doneProlog;
int state;
int eof;
unsigned long endOffset;
const ENCODING *enc;
INIT_ENCODING initEnc;
};
#define XmlTokBufferOffset(tb) ((tb)->endOffset - ((tb)->end - (tb)->ptr))
@ -145,9 +155,10 @@ void XmlTokBufferInit(struct XmlTokBuffer *tb, int fd)
tb->ptr = tb->buf;
tb->size = READSIZE;
tb->fd = fd;
tb->doneProlog = 0;
tb->state = XML_PROLOG_STATE;
tb->eof = 0;
tb->endOffset = 0;
XmlInitEncoding(&(tb->initEnc), &(tb->enc));
}
void XmlTokBufferFree(struct XmlTokBuffer *tb)
@ -161,14 +172,10 @@ int XmlGetToken(struct XmlTokBuffer *tb, const char **tokStart, size_t *tokLengt
for (;;) {
int nBytes;
const char *start = tb->ptr;
if (!tb->doneProlog) {
tok = XmlPrologTok(start, tb->end, &tb->ptr);
if (tok == XML_TOK_START_TAG)
tb->doneProlog = 1;
}
else
tok = XmlContentTok(start, tb->end, &tb->ptr);
tok = XmlTok(tb->enc, tb->state, start, tb->end, &tb->ptr);
if (tok >= 0) {
if (tok == XML_TOK_START_TAG)
tb->state = XML_CONTENT_STATE;
*tokStart = start;
*tokLength = tb->ptr - start;
break;
@ -275,6 +282,7 @@ int main(int argc, char **argv)
fprintf(stderr, "usage: %s filename ...\n", argv[0]);
return 1;
}
fprintf(stderr, "version 0.1\n");
for (i = 1; i < argc; i++)
if (!doFile(argv[i]))
ret = 1;

View file

@ -1,389 +1,221 @@
/* TODO
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
Better prolog tokenization
<!NAME
NMTOKEN
NAME
PEREF
*/
#ifdef _MSC_VER
#define XMLTOKAPI __declspec(dllexport)
#endif
#include "xmltok.h"
#include "nametab.h"
#ifdef UNICODE
typedef wchar_t TCHAR;
#else
typedef char TCHAR;
#endif
#define UCS2_GET_NAMING(pages, hi, lo) \
(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
#define DIGIT_CASES \
case '0': case '1': case '2': case '3': case '4': \
case '5': case '6': case '7': case '8': case '9':
/* A 2 byte UTF-8 representation splits the character's 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
+ ((((byte)[0]) & 3) << 1) \
+ ((((byte)[1]) >> 5) & 1)] \
& (1 << (((byte)[1]) & 0x1F)))
#define HEX_DIGIT_CASES DIGIT_CASES \
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': \
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
/* A 3 byte UTF-8 representation splits the character's 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
+ ((((byte)[1]) >> 2) & 0xF)] \
<< 3) \
+ ((((byte)[1]) & 3) << 1) \
+ ((((byte)[2]) >> 5) & 1)] \
& (1 << (((byte)[2]) & 0x1F)))
#define S_CASES case ' ': case '\t': case '\r': case '\n':
#define UTF8_GET_NAMING(pages, p, n) \
((n) == 2 \
? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
: ((n) == 3 \
? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
: 0))
/* ptr points to character following "<!-" */
static
int scanComment(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
#include "xmltok_impl.h"
/* An encoding that classifies input one byte at a time through a
   256-entry table.  BYTE_TYPE casts an ENCODING pointer to a pointer to
   this struct, so `enc` must remain the first member. */
struct normal_encoding {
/* Generic scanner table and minimum bytes per character. */
ENCODING enc;
/* Byte type for every possible byte value, indexed by the byte
   converted to unsigned char. */
unsigned char type[256];
};
/* Instantiate the scanner template (xmltok_impl.c) for encodings that
   are classified one byte at a time via a struct normal_encoding table.
   The generated functions are named normal_<ident> by PREFIX. */

/* minimum bytes per character */
#define MINBPC 1
/* Look up the type of the byte at p.  enc is a const ENCODING * that
   really points at the `enc` member of a struct normal_encoding; the
   cast keeps the const qualifier because the table is only read. */
#define BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
/* n is the length in bytes of the UTF-8 sequence starting at p. */
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

/* UTF-8: prolog and content scanners from the template above, one byte
   minimum per character, and a byte-type table whose entries come from
   asciitab.h followed by utf8tab.h. */
const struct normal_encoding utf8_encoding = {
  { { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
#include "asciitab.h"
#include "utf8tab.h"
};

#undef PREFIX
/* Byte-type table used by the 2-byte encodings for characters whose
   high-order byte is zero (see BYTE_TYPE in the little2/big2 sections).
   Entries come from asciitab.h followed by latin1tab.h.  The table is
   only ever read, so it is declared const. */
static const unsigned char latin1tab[256] = {
#include "asciitab.h"
#include "latin1tab.h"
};
static int unicode_byte_type(char hi, char lo)
{
if (ptr != end) {
if (*ptr != '-') {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
for (++ptr; ptr != end; ptr++) {
if (*ptr == '-') {
if (++ptr == end)
return XML_TOK_PARTIAL;
if (*ptr == '-') {
if (++ptr == end)
return XML_TOK_PARTIAL;
if (*ptr != '>') {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + 1;
return XML_TOK_COMMENT;
}
}
switch ((unsigned char)hi) {
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
return BT_LEAD4;
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
return BT_TRAIL;
case 0xFF:
switch ((unsigned char)lo) {
case 0xFF:
case 0xFE:
return BT_NONXML;
}
break;
}
return XML_TOK_PARTIAL;
return BT_NONASCII;
}
/* ptr points to character following "<!" */
/* Instantiate the scanner template (xmltok_impl.c) for a 2-byte
   encoding stored low-order byte first: p[0] is the low byte and p[1]
   the high byte.  The generated functions are named little2_<ident>. */
#define PREFIX(ident) little2_ ## ident
#define MINBPC 2
/* Characters with a zero high byte are classified through latin1tab;
   everything else goes through unicode_byte_type(hi, lo). */
#define BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? latin1tab[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))
/* c is an ASCII character; it matches only when the high byte is 0. */
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* The naming bitmap is indexed by (high byte, low byte); n is unused. */
#define IS_NAME_CHAR(enc, p, n) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define IS_NMSTRT_CHAR(enc, p, n) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

/* Prolog and content scanners from the template above; two bytes
   minimum per character. */
const struct encoding little2_encoding = {
  { PREFIX(prologTok), PREFIX(contentTok) }, 2
};

#undef PREFIX
/* Instantiate the scanner template (xmltok_impl.c) for a 2-byte
   encoding stored high-order byte first: p[0] is the high byte and p[1]
   the low byte.  The generated functions are named big2_<ident>. */
#define PREFIX(ident) big2_ ## ident
#define MINBPC 2
/* Characters with a zero high byte are classified through latin1tab;
   everything else goes through unicode_byte_type(hi, lo). */
#define BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? latin1tab[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available.
   c is an ASCII character; it matches only when the high byte is 0. */
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* The naming bitmap is indexed by (high byte, low byte); n is unused. */
#define IS_NAME_CHAR(enc, p, n) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define IS_NMSTRT_CHAR(enc, p, n) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

/* Prolog and content scanners from the template above; two bytes
   minimum per character. */
const struct encoding big2_encoding = {
  { PREFIX(prologTok), PREFIX(contentTok) }, 2
};

#undef PREFIX
static
int scanDecl(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
const char **nextTokPtr)
{
if (ptr != end) {
if (*ptr == '-')
return scanComment(ptr + 1, end, nextTokPtr);
do {
switch (*ptr) {
case '\'':
case '"':
case '<':
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
}
} while (++ptr != end);
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
}
return XML_TOK_PARTIAL;
}
const ENCODING **encPtr;
/* ptr points to character following "<?" */
static
int scanPi(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
for (; ptr != end; ++ptr) {
switch (*ptr) {
case '?':
if (ptr + 1 == end)
return XML_TOK_PARTIAL;
if (ptr[1] == '>') {
*nextTokPtr = ptr + 2;
return XML_TOK_PI;
}
if (ptr == end)
return XML_TOK_NONE;
encPtr = ((const INIT_ENCODING *)enc)->encPtr;
if (ptr + 1 == end) {
switch ((unsigned char)*ptr) {
case 0xFE:
case 0xFF:
case 0x00:
case 0x3C:
return XML_TOK_PARTIAL;
}
}
return XML_TOK_PARTIAL;
}
/* ptr points to character following "<" */
static
int scanStartTag(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
for (; ptr != end; ++ptr) {
switch (*ptr) {
case '<':
*nextTokPtr = ptr;
return XML_TOK_INVALID;
case '>':
*nextTokPtr = ptr + 1;
return XML_TOK_START_TAG;
case '"':
for (++ptr;; ++ptr) {
if (ptr == end)
return XML_TOK_PARTIAL;
if (*ptr == '"')
break;
}
break;
case '\'':
for (++ptr;; ++ptr) {
if (ptr == end)
return XML_TOK_PARTIAL;
if (*ptr == '\'')
break;
}
break;
case '/':
if (++ptr == end)
return XML_TOK_PARTIAL;
if (*ptr != '>') {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + 1;
return XML_TOK_EMPTY_ELEMENT;
else {
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
case 0x003C:
*encPtr = &big2_encoding;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFEFF:
*nextTokPtr = ptr + 2;
*encPtr = &big2_encoding;
return XML_TOK_BOM;
case 0x3C00:
*encPtr = &little2_encoding;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFFFE:
*nextTokPtr = ptr + 2;
*encPtr = &little2_encoding;
return XML_TOK_BOM;
}
}
return XML_TOK_PARTIAL;
}
/* ptr points to character following "</" */
static
int scanEndTag(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
for (; ptr != end; ++ptr) {
switch (*ptr) {
case '<':
case '&':
*nextTokPtr = ptr;
return XML_TOK_INVALID;
case '>':
*nextTokPtr = ptr + 1;
return XML_TOK_END_TAG;
}
}
return XML_TOK_PARTIAL;
}
/* ptr points to character following "&#X" */
static
int scanHexCharRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
if (ptr != end) {
switch (*ptr) {
HEX_DIGIT_CASES
break;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
for (++ptr; ptr != end; ++ptr) {
switch (*ptr) {
HEX_DIGIT_CASES
break;
case ';':
*nextTokPtr = ptr + 1;
return XML_TOK_CHAR_REF;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
}
return XML_TOK_PARTIAL;
}
/* ptr points to character following "&#" */
static
int scanCharRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
if (ptr != end) {
switch (*ptr) {
case 'x':
case 'X':
return scanHexCharRef(ptr + 1, end, nextTokPtr);
DIGIT_CASES
break;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
for (++ptr; ptr != end; ++ptr) {
switch (*ptr) {
DIGIT_CASES
break;
case ';':
*nextTokPtr = ptr + 1;
return XML_TOK_CHAR_REF;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
}
return XML_TOK_PARTIAL;
*encPtr = &utf8_encoding.enc;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
static
int scanEntityRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
int initScanProlog(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
for (; ptr != end; ++ptr) {
switch (*ptr) {
case '<':
case '>':
case '&':
S_CASES
*nextTokPtr = ptr;
return XML_TOK_INVALID;
case ';':
*nextTokPtr = ptr + 1;
return XML_TOK_ENTITY_REF;
}
}
return XML_TOK_PARTIAL;
return initScan(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
}
/* ptr points to character following "<![" */
static
int scanCdataSection(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
int i;
/* CDATA[]]> */
if (end - ptr < 9)
return XML_TOK_PARTIAL;
for (i = 0; i < 6; i++, ptr++) {
if (*ptr != "CDATA["[i]) {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
end -= 2;
for (; ptr != end; ++ptr) {
if (*ptr == ']') {
if (ptr[1] == ']' && ptr[2] == '>') {
*nextTokPtr = ptr + 3;
return XML_TOK_CDATA_SECTION;
}
}
}
return XML_TOK_PARTIAL;
return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
}
int XmlContentTok(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
{
if (ptr != end) {
switch (*ptr) {
case '<':
{
++ptr;
if (ptr == end)
return XML_TOK_PARTIAL;
switch (*ptr) {
case '!':
if (++ptr == end)
return XML_TOK_PARTIAL;
switch (*ptr) {
case '-':
return scanComment(ptr + 1, end, nextTokPtr);
case '[':
return scanCdataSection(ptr + 1, end, nextTokPtr);
}
*nextTokPtr = ptr;
return XML_TOK_INVALID;
case '?':
return scanPi(ptr + 1, end, nextTokPtr);
case '/':
return scanEndTag(ptr + 1, end, nextTokPtr);
case '>':
S_CASES
*nextTokPtr = ptr;
return XML_TOK_INVALID;
default:
return scanStartTag(ptr, end, nextTokPtr);
}
}
case '&':
{
++ptr;
if (ptr == end)
return XML_TOK_PARTIAL;
switch (*ptr) {
case '#':
return scanCharRef(ptr + 1, end, nextTokPtr);
S_CASES
case ';':
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
return scanEntityRef(ptr + 1, end, nextTokPtr);
}
default:
{
for (++ptr; ptr != end; ++ptr) {
switch (*ptr) {
case '&':
case '<':
*nextTokPtr = ptr;
return XML_TOK_DATA_CHARS;
}
}
*nextTokPtr = ptr;
return XML_TOK_DATA_CHARS;
}
}
}
return XML_TOK_NONE;
}
int XmlPrologTok(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
{
if (ptr != end) {
switch (*ptr) {
case '"':
{
for (++ptr; ptr != end; ++ptr) {
if (*ptr == '"') {
*nextTokPtr = ptr + 1;
return XML_TOK_LITERAL;
}
}
return XML_TOK_PARTIAL;
}
case '\'':
{
for (++ptr; ptr != end; ++ptr) {
if (*ptr == '\'') {
*nextTokPtr = ptr + 1;
return XML_TOK_LITERAL;
}
}
return XML_TOK_PARTIAL;
}
case '<':
{
++ptr;
if (ptr == end)
return XML_TOK_PARTIAL;
switch (*ptr) {
case '!':
return scanDecl(ptr + 1, end, nextTokPtr);
case '?':
return scanPi(ptr + 1, end, nextTokPtr);
case '/':
*nextTokPtr = ptr;
return XML_TOK_INVALID;
default:
return XmlContentTok(ptr - 1, end, nextTokPtr);
}
}
default:
{
for (++ptr; ptr != end; ++ptr) {
switch (*ptr) {
case '<':
case '"':
case '\'':
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
}
}
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
}
}
}
return XML_TOK_NONE;
p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
p->initEnc.minBytesPerChar = 1;
p->encPtr = encPtr;
*encPtr = &(p->initEnc);
}

View file

@ -1,40 +1,53 @@
#ifndef XmlTok_INCLUDED
#define XmlTok_INCLUDED 1
#ifdef __cplusplus
extern "C" {
#endif
#ifndef XMLTOKAPI
#define XMLTOKAPI /* as nothing */
#endif
#include <stddef.h>
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
#define XML_TOK_NONE -2 /* The string to be scanned is empty */
#define XML_TOK_PARTIAL -1
#define XML_TOK_NONE -3 /* The string to be scanned is empty */
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
#define XML_TOK_PARTIAL -1 /* only part of a token */
#define XML_TOK_INVALID 0
#define XML_TOK_COMMENT 1
#define XML_TOK_PI 2 /* processing instruction */
#define XML_TOK_BOM 1 /* Byte order mark */
#define XML_TOK_COMMENT 2
#define XML_TOK_PI 3 /* processing instruction */
/* The following tokens are returned only by XmlPrologTok */
#define XML_TOK_LITERAL 3
#define XML_TOK_PROLOG_CHARS 4
#define XML_TOK_LITERAL 4
#define XML_TOK_PROLOG_CHARS 5
#define XML_TOK_PROLOG_S 6
/* The following token is returned by XmlPrologTok when it detects the end
of the prolog and is also returned by XmlContentTok */
#define XML_TOK_START_TAG 5
#define XML_TOK_START_TAG 7
/* The following tokens are returned only by XmlContentTok */
#define XML_TOK_END_TAG 6
#define XML_TOK_EMPTY_ELEMENT 7 /* empty element tag <e/> */
#define XML_TOK_DATA_CHARS 8
#define XML_TOK_CDATA_SECTION 9
#define XML_TOK_ENTITY_REF 10
#define XML_TOK_CHAR_REF 11 /* numeric character reference */
#define XML_TOK_END_TAG 8
#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
#define XML_TOK_DATA_CHARS 10
#define XML_TOK_CDATA_SECTION 11
#define XML_TOK_ENTITY_REF 12
#define XML_TOK_CHAR_REF 13 /* numeric character reference */
#ifdef __cplusplus
extern "C" {
#endif
#define XML_NSTATES 2
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
/* Describes how to tokenize input in one character encoding. */
typedef struct encoding {
/* One scanner per tokenizer state, indexed by XML_PROLOG_STATE or
   XML_CONTENT_STATE.  Each scanner receives the encoding itself, the
   start and end of the input, and a pointer through which it reports
   where the next token begins; it returns an XML_TOK_* code. */
int (*scanners[XML_NSTATES])(const struct encoding *,
const char *,
const char *,
const char **);
/* Smallest number of bytes a character occupies in this encoding. */
int minBytesPerChar;
} ENCODING;
/*
Scan the string starting at ptr until the end of the next complete token,
@ -56,30 +69,25 @@ may be returned together. Similarly for characters in the prolog outside
literals, comments and processing instructions.
*/
int XMLTOKAPI XmlPrologTokA(const char *ptr,
const char *eptr,
const char **nextTokPtr);
int XMLTOKAPI XmlContentTokA(const char *ptr,
const char *eptr,
const char **nextTokPtr);
int XMLTOKAPI XmlPrologTokW(const wchar_t *ptr,
const wchar_t *eptr,
const wchar_t **nextTokPtr);
int XMLTOKAPI XmlContentTokW(const wchar_t *ptr,
const wchar_t *eptr,
const wchar_t **nextTokPtr);
/* Dispatch to the scanner that `enc` provides for tokenizer state
   `state` (XML_PROLOG_STATE or XML_CONTENT_STATE).  Note that `enc` is
   evaluated twice, so it must not have side effects. */
#define XmlTok(enc, state, ptr, end, nextTokPtr) \
(((enc)->scanners[state])(enc, ptr, end, nextTokPtr))
/* Scan one token using the prolog scanner of `enc`. */
#define XmlPrologTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr)
/* Scan one token using the content scanner of `enc`. */
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
/* Bootstrap encoding used before the real encoding of the input is
   known.  Its scanners receive a pointer to `initEnc` and cast it back
   to INIT_ENCODING, so `initEnc` must remain the first member. */
typedef struct {
/* Encoding whose scanners perform the detection. */
ENCODING initEnc;
/* Location of the caller's current-encoding pointer; the detecting
   scanners store the chosen encoding through it. */
const ENCODING **encPtr;
} INIT_ENCODING;
/* Set up the INIT_ENCODING and make the caller's encoding pointer (second
   argument) point at its embedded initEnc, so that the first XmlTok call
   through that pointer runs the detecting scanner. */
void XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **);
#ifdef __cplusplus
}
#endif
#ifdef UNICODE
#define XmlPrologTok XmlPrologTokW
#define XmlContentTok XmlContentTokW
#else
#define XmlPrologTok XmlPrologTokA
#define XmlContentTok XmlContentTokA
#endif
#endif /* not XmlTok_INCLUDED */