mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-05 05:05:00 +00:00
Rewrite for proper multibyte support and more well-formedness checking.
This commit is contained in:
parent
7834fbbf8a
commit
9ecf532908
4 changed files with 293 additions and 431 deletions
|
@ -54,7 +54,6 @@ INTDIR=.\Release
|
|||
ALL : "$(OUTDIR)\xmltok.dll"
|
||||
|
||||
CLEAN :
|
||||
-@erase "$(INTDIR)\wxmltok.obj"
|
||||
-@erase "$(INTDIR)\xmltok.obj"
|
||||
-@erase "$(OUTDIR)\xmltok.dll"
|
||||
-@erase "$(OUTDIR)\xmltok.exp"
|
||||
|
@ -112,7 +111,6 @@ LINK32_FLAGS=kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib\
|
|||
/pdb:"$(OUTDIR)/xmltok.pdb" /machine:I386 /out:"$(OUTDIR)/xmltok.dll"\
|
||||
/implib:"$(OUTDIR)/xmltok.lib"
|
||||
LINK32_OBJS= \
|
||||
"$(INTDIR)\wxmltok.obj" \
|
||||
"$(INTDIR)\xmltok.obj"
|
||||
|
||||
"$(OUTDIR)\xmltok.dll" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
|
||||
|
@ -140,7 +138,6 @@ ALL : "$(OUTDIR)\xmltok.dll"
|
|||
CLEAN :
|
||||
-@erase "$(INTDIR)\vc40.idb"
|
||||
-@erase "$(INTDIR)\vc40.pdb"
|
||||
-@erase "$(INTDIR)\wxmltok.obj"
|
||||
-@erase "$(INTDIR)\xmltok.obj"
|
||||
-@erase "$(OUTDIR)\xmltok.dll"
|
||||
-@erase "$(OUTDIR)\xmltok.exp"
|
||||
|
@ -199,7 +196,6 @@ LINK32_FLAGS=kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib\
|
|||
/pdb:"$(OUTDIR)/xmltok.pdb" /debug /machine:I386 /out:"$(OUTDIR)/xmltok.dll"\
|
||||
/implib:"$(OUTDIR)/xmltok.lib"
|
||||
LINK32_OBJS= \
|
||||
"$(INTDIR)\wxmltok.obj" \
|
||||
"$(INTDIR)\xmltok.obj"
|
||||
|
||||
"$(OUTDIR)\xmltok.dll" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
|
||||
|
@ -380,37 +376,40 @@ LINK32_OBJS= \
|
|||
# Begin Source File
|
||||
|
||||
SOURCE=.\xmltok.c
|
||||
DEP_CPP_XMLTO=\
|
||||
".\xmltok.h"\
|
||||
|
||||
|
||||
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
|
||||
|
||||
|
||||
# End Source File
|
||||
################################################################################
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\wxmltok.c
|
||||
|
||||
!IF "$(CFG)" == "xmltok - Win32 Release"
|
||||
|
||||
DEP_CPP_WXMLT=\
|
||||
".\xmltok.c"\
|
||||
DEP_CPP_XMLTO=\
|
||||
".\asciitab.h"\
|
||||
".\latin1tab.h"\
|
||||
".\nametab.h"\
|
||||
".\utf8tab.h"\
|
||||
".\xmltok.h"\
|
||||
".\xmltok_impl.c"\
|
||||
".\xmltok_impl.h"\
|
||||
|
||||
# ADD CPP /Ob2
|
||||
|
||||
"$(INTDIR)\wxmltok.obj" : $(SOURCE) $(DEP_CPP_WXMLT) "$(INTDIR)" ".\xmltok.c"
|
||||
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
|
||||
$(CPP) /nologo /MT /W3 /GX /O2 /Ob2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS"\
|
||||
/Fp"$(INTDIR)/xmltok.pch" /YX /Fo"$(INTDIR)/" /c $(SOURCE)
|
||||
|
||||
|
||||
!ELSEIF "$(CFG)" == "xmltok - Win32 Debug"
|
||||
|
||||
DEP_CPP_WXMLT=\
|
||||
".\xmltok.c"\
|
||||
DEP_CPP_XMLTO=\
|
||||
".\asciitab.h"\
|
||||
".\latin1tab.h"\
|
||||
".\nametab.h"\
|
||||
".\utf8tab.h"\
|
||||
".\xmltok.h"\
|
||||
".\xmltok_impl.c"\
|
||||
".\xmltok_impl.h"\
|
||||
|
||||
|
||||
"$(INTDIR)\wxmltok.obj" : $(SOURCE) $(DEP_CPP_WXMLT) "$(INTDIR)" ".\xmltok.c"
|
||||
"$(INTDIR)\xmltok.obj" : $(SOURCE) $(DEP_CPP_XMLTO) "$(INTDIR)"
|
||||
$(CPP) /nologo /MTd /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS"\
|
||||
/Fp"$(INTDIR)/xmltok.pch" /YX /Fo"$(INTDIR)/" /Fd"$(INTDIR)/" /c $(SOURCE)
|
||||
|
||||
|
||||
!ENDIF
|
||||
|
@ -451,6 +450,9 @@ DEP_CPP_WXMLT=\
|
|||
# Begin Source File
|
||||
|
||||
SOURCE=.\xmlec\xmlec.c
|
||||
|
||||
!IF "$(CFG)" == "xmlec - Win32 Release"
|
||||
|
||||
DEP_CPP_XMLEC=\
|
||||
".\xmltok.h"\
|
||||
|
||||
|
@ -459,6 +461,18 @@ DEP_CPP_XMLEC=\
|
|||
$(CPP) $(CPP_PROJ) $(SOURCE)
|
||||
|
||||
|
||||
!ELSEIF "$(CFG)" == "xmlec - Win32 Debug"
|
||||
|
||||
DEP_CPP_XMLEC=\
|
||||
".\xmltok.h"\
|
||||
|
||||
|
||||
"$(INTDIR)\xmlec.obj" : $(SOURCE) $(DEP_CPP_XMLEC) "$(INTDIR)"
|
||||
$(CPP) $(CPP_PROJ) $(SOURCE)
|
||||
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# End Target
|
||||
# End Project
|
||||
|
|
|
@ -11,7 +11,8 @@
|
|||
#include <windows.h>
|
||||
|
||||
static
|
||||
int XmlSkipProlog(const char **s, const char *end, const char **nextTokP);
|
||||
int XmlSkipProlog(const char **s, const char *end, const char **nextTokP,
|
||||
const ENCODING **enc);
|
||||
|
||||
int XmlParse(const char *s, size_t n, const char *filename)
|
||||
{
|
||||
|
@ -19,7 +20,8 @@ int XmlParse(const char *s, size_t n, const char *filename)
|
|||
const char *start = s;
|
||||
const char *end = s + n;
|
||||
const char *next;
|
||||
int tok = XmlSkipProlog(&s, end, &next);
|
||||
const ENCODING *enc;
|
||||
int tok = XmlSkipProlog(&s, end, &next, &enc);
|
||||
for (;;) {
|
||||
switch (tok) {
|
||||
case XML_TOK_NONE:
|
||||
|
@ -31,12 +33,15 @@ int XmlParse(const char *s, size_t n, const char *filename)
|
|||
return 1;
|
||||
case XML_TOK_INVALID:
|
||||
fprintf(stderr, "%s: well-formedness error at byte %lu\n",
|
||||
filename, (unsigned long)(s - start));
|
||||
filename, (unsigned long)(next - start));
|
||||
return 0;
|
||||
case XML_TOK_PARTIAL:
|
||||
fprintf(stderr, "%s: unclosed token started at byte %lu\n",
|
||||
filename, (unsigned long)(s - start));
|
||||
return 0;
|
||||
case XML_TOK_PARTIAL_CHAR:
|
||||
fprintf(stderr, "%s: malformed input\n", filename);
|
||||
return 0;
|
||||
case XML_TOK_COMMENT:
|
||||
break;
|
||||
case XML_TOK_START_TAG:
|
||||
|
@ -46,22 +51,25 @@ int XmlParse(const char *s, size_t n, const char *filename)
|
|||
break;
|
||||
}
|
||||
s = next;
|
||||
tok = XmlContentTok(s, end, &next);
|
||||
tok = XmlContentTok(enc, s, end, &next);
|
||||
}
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
static
|
||||
int XmlSkipProlog(const char **startp, const char *end, const char **nextTokP)
|
||||
int XmlSkipProlog(const char **startp, const char *end,
|
||||
const char **nextTokP, const ENCODING **enc)
|
||||
{
|
||||
const char *s = *startp;
|
||||
INIT_ENCODING initEnc;
|
||||
XmlInitEncoding(&initEnc, enc);
|
||||
for (;;) {
|
||||
int tok = XmlPrologTok(s, end, nextTokP);
|
||||
int tok = XmlPrologTok(*enc, s, end, nextTokP);
|
||||
switch (tok) {
|
||||
case XML_TOK_NONE:
|
||||
case XML_TOK_INVALID:
|
||||
case XML_TOK_PARTIAL:
|
||||
case XML_TOK_START_TAG:
|
||||
case XML_TOK_INVALID:
|
||||
case XML_TOK_NONE:
|
||||
case XML_TOK_PARTIAL:
|
||||
*startp = s;
|
||||
return tok;
|
||||
default:
|
||||
|
@ -122,9 +130,11 @@ struct XmlTokBuffer {
|
|||
char *ptr;
|
||||
size_t size;
|
||||
int fd;
|
||||
int doneProlog;
|
||||
int state;
|
||||
int eof;
|
||||
unsigned long endOffset;
|
||||
const ENCODING *enc;
|
||||
INIT_ENCODING initEnc;
|
||||
};
|
||||
|
||||
#define XmlTokBufferOffset(tb) ((tb)->endOffset - ((tb)->end - (tb)->ptr))
|
||||
|
@ -145,9 +155,10 @@ void XmlTokBufferInit(struct XmlTokBuffer *tb, int fd)
|
|||
tb->ptr = tb->buf;
|
||||
tb->size = READSIZE;
|
||||
tb->fd = fd;
|
||||
tb->doneProlog = 0;
|
||||
tb->state = XML_PROLOG_STATE;
|
||||
tb->eof = 0;
|
||||
tb->endOffset = 0;
|
||||
XmlInitEncoding(&(tb->initEnc), &(tb->enc));
|
||||
}
|
||||
|
||||
void XmlTokBufferFree(struct XmlTokBuffer *tb)
|
||||
|
@ -161,14 +172,10 @@ int XmlGetToken(struct XmlTokBuffer *tb, const char **tokStart, size_t *tokLengt
|
|||
for (;;) {
|
||||
int nBytes;
|
||||
const char *start = tb->ptr;
|
||||
if (!tb->doneProlog) {
|
||||
tok = XmlPrologTok(start, tb->end, &tb->ptr);
|
||||
if (tok == XML_TOK_START_TAG)
|
||||
tb->doneProlog = 1;
|
||||
}
|
||||
else
|
||||
tok = XmlContentTok(start, tb->end, &tb->ptr);
|
||||
tok = XmlTok(tb->enc, tb->state, start, tb->end, &tb->ptr);
|
||||
if (tok >= 0) {
|
||||
if (tok == XML_TOK_START_TAG)
|
||||
tb->state = XML_CONTENT_STATE;
|
||||
*tokStart = start;
|
||||
*tokLength = tb->ptr - start;
|
||||
break;
|
||||
|
@ -275,6 +282,7 @@ int main(int argc, char **argv)
|
|||
fprintf(stderr, "usage: %s filename ...\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "version 0.1\n");
|
||||
for (i = 1; i < argc; i++)
|
||||
if (!doFile(argv[i]))
|
||||
ret = 1;
|
||||
|
|
|
@ -1,389 +1,221 @@
|
|||
/* TODO
|
||||
|
||||
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
|
||||
|
||||
Better prolog tokenization
|
||||
|
||||
<!NAME
|
||||
NMTOKEN
|
||||
NAME
|
||||
PEREF
|
||||
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define XMLTOKAPI __declspec(dllexport)
|
||||
#endif
|
||||
|
||||
#include "xmltok.h"
|
||||
#include "nametab.h"
|
||||
|
||||
#ifdef UNICODE
|
||||
typedef wchar_t TCHAR;
|
||||
#else
|
||||
typedef char TCHAR;
|
||||
#endif
|
||||
#define UCS2_GET_NAMING(pages, hi, lo) \
|
||||
(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
|
||||
|
||||
#define DIGIT_CASES \
|
||||
case '0': case '1': case '2': case '3': case '4': \
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
/* A 2 byte UTF-8 representation splits the character's 11 bits
|
||||
between the bottom 5 and 6 bits of the bytes.
|
||||
We need 8 bits to index into pages, 3 bits to add to that index and
|
||||
5 bits to generate the mask. */
|
||||
#define UTF8_GET_NAMING2(pages, byte) \
|
||||
(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
|
||||
+ ((((byte)[0]) & 3) << 1) \
|
||||
+ ((((byte)[1]) >> 5) & 1)] \
|
||||
& (1 << (((byte)[1]) & 0x1F)))
|
||||
|
||||
#define HEX_DIGIT_CASES DIGIT_CASES \
|
||||
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': \
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
|
||||
/* A 3 byte UTF-8 representation splits the character's 16 bits
|
||||
between the bottom 4, 6 and 6 bits of the bytes.
|
||||
We need 8 bits to index into pages, 3 bits to add to that index and
|
||||
5 bits to generate the mask. */
|
||||
#define UTF8_GET_NAMING3(pages, byte) \
|
||||
(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
|
||||
+ ((((byte)[1]) >> 2) & 0xF)] \
|
||||
<< 3) \
|
||||
+ ((((byte)[1]) & 3) << 1) \
|
||||
+ ((((byte)[2]) >> 5) & 1)] \
|
||||
& (1 << (((byte)[2]) & 0x1F)))
|
||||
|
||||
#define S_CASES case ' ': case '\t': case '\r': case '\n':
|
||||
#define UTF8_GET_NAMING(pages, p, n) \
|
||||
((n) == 2 \
|
||||
? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
|
||||
: ((n) == 3 \
|
||||
? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
|
||||
: 0))
|
||||
|
||||
/* ptr points to character following "<!-" */
|
||||
|
||||
static
|
||||
int scanComment(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
#include "xmltok_impl.h"
|
||||
|
||||
/* An ENCODING header followed by a 256-entry table that classifies every
   possible byte value.  BYTE_TYPE casts an ENCODING pointer to this struct
   and indexes type[] with the byte at p, so enc must stay the first member. */
struct normal_encoding {
|
||||
ENCODING enc;
|
||||
unsigned char type[256];
|
||||
};
|
||||
|
||||
/* minimum bytes per character */
|
||||
#define MINBPC 1
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
(((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
|
||||
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
|
||||
|
||||
/* c is an ASCII character */
|
||||
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
|
||||
|
||||
#define PREFIX(ident) normal_ ## ident
|
||||
#include "xmltok_impl.c"
|
||||
|
||||
#undef MINBPC
|
||||
#undef BYTE_TYPE
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
/* Encoding object for UTF-8.  PREFIX is still normal_ at this point, so the
   two scanners are the normal_prologTok / normal_contentTok instances produced
   by the xmltok_impl.c include above; minBytesPerChar is 1.  The entries of
   the type[] table come from the two included headers
   (presumably 128 entries each -- confirm against asciitab.h / utf8tab.h). */
const struct normal_encoding utf8_encoding = {
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
/* Byte type table consulted by the little2_/big2_ BYTE_TYPE macros when the
   other byte of a 2-byte unit is zero; the remaining units go through
   unicode_byte_type().  Entries come from the two included headers.
   NOTE(review): nothing visible here writes to this table, so it could
   probably be declared const -- confirm against xmltok_impl.c. */
static unsigned char latin1tab[256] = {
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
static int unicode_byte_type(char hi, char lo)
|
||||
{
|
||||
if (ptr != end) {
|
||||
if (*ptr != '-') {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
for (++ptr; ptr != end; ptr++) {
|
||||
if (*ptr == '-') {
|
||||
if (++ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (*ptr == '-') {
|
||||
if (++ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (*ptr != '>') {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_COMMENT;
|
||||
}
|
||||
}
|
||||
switch ((unsigned char)hi) {
|
||||
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
|
||||
return BT_LEAD4;
|
||||
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
|
||||
return BT_TRAIL;
|
||||
case 0xFF:
|
||||
switch ((unsigned char)lo) {
|
||||
case 0xFF:
|
||||
case 0xFE:
|
||||
return BT_NONXML;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
return BT_NONASCII;
|
||||
}
|
||||
|
||||
/* ptr points to character following "<!" */
|
||||
#define PREFIX(ident) little2_ ## ident
|
||||
#define MINBPC 2
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
||||
#undef MINBPC
|
||||
#undef BYTE_TYPE
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
/* Encoding object for 2-byte units whose second byte is the high-order byte
   (the little2_ BYTE_TYPE passes p[1] as hi and p[0] as lo).  Scanners are the
   little2_ instances of xmltok_impl.c; minBytesPerChar is 2. */
const struct encoding little2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
#define PREFIX(ident) big2_ ## ident
|
||||
#define MINBPC 2
|
||||
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
||||
#undef MINBPC
|
||||
#undef BYTE_TYPE
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
/* Encoding object for 2-byte units whose first byte is the high-order byte
   (the big2_ BYTE_TYPE passes p[0] as hi and p[1] as lo).  Scanners are the
   big2_ instances of xmltok_impl.c; minBytesPerChar is 2. */
const struct encoding big2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
static
|
||||
int scanDecl(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
if (*ptr == '-')
|
||||
return scanComment(ptr + 1, end, nextTokPtr);
|
||||
do {
|
||||
switch (*ptr) {
|
||||
case '\'':
|
||||
case '"':
|
||||
case '<':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
} while (++ptr != end);
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
const ENCODING **encPtr;
|
||||
|
||||
/* ptr points to character following "<?" */
|
||||
|
||||
static
|
||||
int scanPi(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
for (; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '?':
|
||||
if (ptr + 1 == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (ptr[1] == '>') {
|
||||
*nextTokPtr = ptr + 2;
|
||||
return XML_TOK_PI;
|
||||
}
|
||||
if (ptr == end)
|
||||
return XML_TOK_NONE;
|
||||
encPtr = ((const INIT_ENCODING *)enc)->encPtr;
|
||||
if (ptr + 1 == end) {
|
||||
switch ((unsigned char)*ptr) {
|
||||
case 0xFE:
|
||||
case 0xFF:
|
||||
case 0x00:
|
||||
case 0x3C:
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "<" */
|
||||
|
||||
static
|
||||
int scanStartTag(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
for (; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '<':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
case '>':
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_START_TAG;
|
||||
case '"':
|
||||
for (++ptr;; ++ptr) {
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (*ptr == '"')
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case '\'':
|
||||
for (++ptr;; ++ptr) {
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (*ptr == '\'')
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case '/':
|
||||
if (++ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (*ptr != '>') {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_EMPTY_ELEMENT;
|
||||
else {
|
||||
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
|
||||
case 0x003C:
|
||||
*encPtr = &big2_encoding;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFEFF:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &big2_encoding;
|
||||
return XML_TOK_BOM;
|
||||
case 0x3C00:
|
||||
*encPtr = &little2_encoding;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFFFE:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &little2_encoding;
|
||||
return XML_TOK_BOM;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "</" */
|
||||
|
||||
static
|
||||
int scanEndTag(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
for (; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '<':
|
||||
case '&':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
case '>':
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_END_TAG;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "&#X" */
|
||||
|
||||
static
|
||||
int scanHexCharRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
switch (*ptr) {
|
||||
HEX_DIGIT_CASES
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
HEX_DIGIT_CASES
|
||||
break;
|
||||
case ';':
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_CHAR_REF;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "&#" */
|
||||
|
||||
static
|
||||
int scanCharRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
switch (*ptr) {
|
||||
case 'x':
|
||||
case 'X':
|
||||
return scanHexCharRef(ptr + 1, end, nextTokPtr);
|
||||
DIGIT_CASES
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
DIGIT_CASES
|
||||
break;
|
||||
case ';':
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_CHAR_REF;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
*encPtr = &utf8_encoding.enc;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
}
|
||||
|
||||
static
|
||||
int scanEntityRef(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
int initScanProlog(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
for (; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '<':
|
||||
case '>':
|
||||
case '&':
|
||||
S_CASES
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
case ';':
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_ENTITY_REF;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
return initScan(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
|
||||
}
|
||||
|
||||
/* ptr points to character following "<![" */
|
||||
|
||||
static
|
||||
int scanCdataSection(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
int i;
|
||||
/* CDATA[]]> */
|
||||
if (end - ptr < 9)
|
||||
return XML_TOK_PARTIAL;
|
||||
for (i = 0; i < 6; i++, ptr++) {
|
||||
if (*ptr != "CDATA["[i]) {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
end -= 2;
|
||||
for (; ptr != end; ++ptr) {
|
||||
if (*ptr == ']') {
|
||||
if (ptr[1] == ']' && ptr[2] == '>') {
|
||||
*nextTokPtr = ptr + 3;
|
||||
return XML_TOK_CDATA_SECTION;
|
||||
}
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
|
||||
return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
|
||||
}
|
||||
|
||||
int XmlContentTok(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
switch (*ptr) {
|
||||
case '<':
|
||||
{
|
||||
++ptr;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (*ptr) {
|
||||
case '!':
|
||||
if (++ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (*ptr) {
|
||||
case '-':
|
||||
return scanComment(ptr + 1, end, nextTokPtr);
|
||||
case '[':
|
||||
return scanCdataSection(ptr + 1, end, nextTokPtr);
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
case '?':
|
||||
return scanPi(ptr + 1, end, nextTokPtr);
|
||||
case '/':
|
||||
return scanEndTag(ptr + 1, end, nextTokPtr);
|
||||
case '>':
|
||||
S_CASES
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
default:
|
||||
return scanStartTag(ptr, end, nextTokPtr);
|
||||
}
|
||||
}
|
||||
case '&':
|
||||
{
|
||||
++ptr;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (*ptr) {
|
||||
case '#':
|
||||
return scanCharRef(ptr + 1, end, nextTokPtr);
|
||||
S_CASES
|
||||
case ';':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
return scanEntityRef(ptr + 1, end, nextTokPtr);
|
||||
}
|
||||
default:
|
||||
{
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '&':
|
||||
case '<':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
}
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
}
|
||||
}
|
||||
}
|
||||
return XML_TOK_NONE;
|
||||
}
|
||||
|
||||
int XmlPrologTok(const TCHAR *ptr, const TCHAR *end, const TCHAR **nextTokPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
switch (*ptr) {
|
||||
case '"':
|
||||
{
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
if (*ptr == '"') {
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_LITERAL;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
case '\'':
|
||||
{
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
if (*ptr == '\'') {
|
||||
*nextTokPtr = ptr + 1;
|
||||
return XML_TOK_LITERAL;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
case '<':
|
||||
{
|
||||
++ptr;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (*ptr) {
|
||||
case '!':
|
||||
return scanDecl(ptr + 1, end, nextTokPtr);
|
||||
case '?':
|
||||
return scanPi(ptr + 1, end, nextTokPtr);
|
||||
case '/':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
default:
|
||||
return XmlContentTok(ptr - 1, end, nextTokPtr);
|
||||
}
|
||||
}
|
||||
default:
|
||||
{
|
||||
for (++ptr; ptr != end; ++ptr) {
|
||||
switch (*ptr) {
|
||||
case '<':
|
||||
case '"':
|
||||
case '\'':
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
}
|
||||
}
|
||||
return XML_TOK_NONE;
|
||||
p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
|
||||
p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
|
||||
p->initEnc.minBytesPerChar = 1;
|
||||
p->encPtr = encPtr;
|
||||
*encPtr = &(p->initEnc);
|
||||
}
|
||||
|
|
|
@ -1,40 +1,53 @@
|
|||
#ifndef XmlTok_INCLUDED
|
||||
#define XmlTok_INCLUDED 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef XMLTOKAPI
|
||||
#define XMLTOKAPI /* as nothing */
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
|
||||
#define XML_TOK_NONE -2 /* The string to be scanned is empty */
|
||||
#define XML_TOK_PARTIAL -1
|
||||
#define XML_TOK_NONE -3 /* The string to be scanned is empty */
|
||||
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
|
||||
#define XML_TOK_PARTIAL -1 /* only part of a token */
|
||||
#define XML_TOK_INVALID 0
|
||||
#define XML_TOK_COMMENT 1
|
||||
#define XML_TOK_PI 2 /* processing instruction */
|
||||
#define XML_TOK_BOM 1 /* Byte order mark */
|
||||
#define XML_TOK_COMMENT 2
|
||||
#define XML_TOK_PI 3 /* processing instruction */
|
||||
|
||||
/* The following tokens are returned only by XmlPrologTok */
|
||||
#define XML_TOK_LITERAL 3
|
||||
#define XML_TOK_PROLOG_CHARS 4
|
||||
#define XML_TOK_LITERAL 4
|
||||
#define XML_TOK_PROLOG_CHARS 5
|
||||
#define XML_TOK_PROLOG_S 6
|
||||
|
||||
/* The following token is returned by XmlPrologTok when it detects the end
|
||||
of the prolog and is also returned by XmlContentTok */
|
||||
|
||||
#define XML_TOK_START_TAG 5
|
||||
#define XML_TOK_START_TAG 7
|
||||
|
||||
/* The following tokens are returned only by XmlContentTok */
|
||||
|
||||
#define XML_TOK_END_TAG 6
|
||||
#define XML_TOK_EMPTY_ELEMENT 7 /* empty element tag <e/> */
|
||||
#define XML_TOK_DATA_CHARS 8
|
||||
#define XML_TOK_CDATA_SECTION 9
|
||||
#define XML_TOK_ENTITY_REF 10
|
||||
#define XML_TOK_CHAR_REF 11 /* numeric character reference */
|
||||
#define XML_TOK_END_TAG 8
|
||||
#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
|
||||
#define XML_TOK_DATA_CHARS 10
|
||||
#define XML_TOK_CDATA_SECTION 11
|
||||
#define XML_TOK_ENTITY_REF 12
|
||||
#define XML_TOK_CHAR_REF 13 /* numeric character reference */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#define XML_NSTATES 2
|
||||
#define XML_PROLOG_STATE 0
|
||||
#define XML_CONTENT_STATE 1
|
||||
|
||||
/* Per-encoding tokenizer description.
   scanners[] holds one scanning function per tokenizer state and is indexed
   by XML_PROLOG_STATE / XML_CONTENT_STATE; the XmlTok macro calls the
   selected entry as (enc, ptr, end, nextTokPtr) and yields its int token
   code.  minBytesPerChar is the smallest number of bytes one character
   occupies in this encoding. */
typedef struct encoding {
|
||||
int (*scanners[XML_NSTATES])(const struct encoding *,
|
||||
const char *,
|
||||
|
||||
const char *,
|
||||
|
||||
const char **);
|
||||
int minBytesPerChar;
|
||||
} ENCODING;
|
||||
|
||||
/*
|
||||
Scan the string starting at ptr until the end of the next complete token,
|
||||
|
@ -56,30 +69,25 @@ may be returned together. Similarly for characters in the prolog outside
|
|||
literals, comments and processing instructions.
|
||||
*/
|
||||
|
||||
int XMLTOKAPI XmlPrologTokA(const char *ptr,
|
||||
const char *eptr,
|
||||
const char **nextTokPtr);
|
||||
int XMLTOKAPI XmlContentTokA(const char *ptr,
|
||||
const char *eptr,
|
||||
const char **nextTokPtr);
|
||||
|
||||
int XMLTOKAPI XmlPrologTokW(const wchar_t *ptr,
|
||||
const wchar_t *eptr,
|
||||
const wchar_t **nextTokPtr);
|
||||
int XMLTOKAPI XmlContentTokW(const wchar_t *ptr,
|
||||
const wchar_t *eptr,
|
||||
const wchar_t **nextTokPtr);
|
||||
#define XmlTok(enc, state, ptr, end, nextTokPtr) \
|
||||
(((enc)->scanners[state])(enc, ptr, end, nextTokPtr))
|
||||
|
||||
#define XmlPrologTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
/* Bootstrap encoding filled in by XmlInitEncoding.
   initEnc is the ENCODING the caller tokenizes with before the real encoding
   is known.  encPtr is the address of the caller's encoding pointer, so the
   scanning code can replace it once an encoding has been chosen.  initEnc is
   the first member so an ENCODING pointer to it can be cast back to
   INIT_ENCODING. */
typedef struct {
|
||||
ENCODING initEnc;
|
||||
const ENCODING **encPtr;
|
||||
} INIT_ENCODING;
|
||||
|
||||
void XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef UNICODE
|
||||
#define XmlPrologTok XmlPrologTokW
|
||||
#define XmlContentTok XmlContentTokW
|
||||
#else
|
||||
#define XmlPrologTok XmlPrologTokA
|
||||
#define XmlContentTok XmlContentTokA
|
||||
#endif
|
||||
|
||||
#endif /* not XmlTok_INCLUDED */
|
||||
|
|
Loading…
Add table
Reference in a new issue