mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-05 05:05:00 +00:00
Realistic prolog tokenization.
This commit is contained in:
parent
7fc8cdf625
commit
9501138aae
8 changed files with 307 additions and 62 deletions
|
@ -7,9 +7,9 @@
|
|||
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM,
|
||||
/* 0x24 */ BT_OTHER, BT_OTHER, BT_AMP, BT_APOS,
|
||||
/* 0x28 */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER,
|
||||
/* 0x2C */ BT_OTHER, BT_MINUS, BT_NAME, BT_SOL,
|
||||
/* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
|
||||
/* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
|
||||
/* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
|
||||
/* 0x30 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT,
|
||||
/* 0x34 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT,
|
||||
/* 0x38 */ BT_DIGIT, BT_DIGIT, BT_NMSTRT, BT_SEMI,
|
||||
|
@ -29,4 +29,4 @@
|
|||
/* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
|
||||
/* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
|
||||
/* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
|
||||
/* 0x7C */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER,
|
||||
/* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER,
|
||||
|
|
|
@ -1,18 +1,3 @@
|
|||
/* TODO
|
||||
|
||||
Provide method to get name length.
|
||||
|
||||
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
|
||||
|
||||
Tokenize prologs in a way useful for well-formedness checking
|
||||
|
||||
<!NAME
|
||||
NMTOKEN
|
||||
NAME
|
||||
PEREF
|
||||
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define XMLTOKAPI __declspec(dllexport)
|
||||
#endif
|
||||
|
@ -80,7 +65,7 @@ struct normal_encoding {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct normal_encoding utf8_encoding = {
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 },
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 1 },
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
};
|
||||
|
@ -129,7 +114,7 @@ static int unicode_byte_type(char hi, char lo)
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding little2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
@ -154,7 +139,7 @@ const struct encoding little2_encoding = {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding big2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
|
|
@ -38,8 +38,28 @@ of the prolog and is also returned by XmlContentTok */
|
|||
|
||||
/* The following tokens are returned only by XmlPrologTok */
|
||||
#define XML_TOK_LITERAL 13
|
||||
#define XML_TOK_PROLOG_CHARS 14
|
||||
#define XML_TOK_PARAM_ENTITY_REF 14
|
||||
#define XML_TOK_PROLOG_S 15
|
||||
#define XML_TOK_DECL_OPEN 16 /* <!foo */
|
||||
#define XML_TOK_DECL_CLOSE 17 /* > */
|
||||
#define XML_TOK_NAME 18
|
||||
#define XML_TOK_NMTOKEN 19
|
||||
#define XML_TOK_POUND_NAME 20 /* #name */
|
||||
#define XML_TOK_COMMA 21
|
||||
#define XML_TOK_OR 22 /* | */
|
||||
#define XML_TOK_PERCENT 23
|
||||
#define XML_TOK_OPEN_PAREN 24
|
||||
#define XML_TOK_CLOSE_PAREN 25
|
||||
#define XML_TOK_OPEN_BRACKET 26
|
||||
#define XML_TOK_CLOSE_BRACKET 27
|
||||
#define XML_TOK_CLOSE_PAREN_QUESTION 28 /* )? */
|
||||
#define XML_TOK_CLOSE_PAREN_ASTERISK 29 /* )* */
|
||||
#define XML_TOK_CLOSE_PAREN_PLUS 30 /* )+ */
|
||||
#define XML_TOK_NAME_QUESTION 31 /* name? */
|
||||
#define XML_TOK_NAME_ASTERISK 32 /* name* */
|
||||
#define XML_TOK_NAME_PLUS 33 /* name+ */
|
||||
#define XML_TOK_COND_SECT_OPEN 34 /* <![ */
|
||||
#define XML_TOK_COND_SECT_CLOSE 35 /* ]]> */
|
||||
|
||||
#define XML_NSTATES 2
|
||||
#define XML_PROLOG_STATE 0
|
||||
|
@ -61,6 +81,8 @@ typedef struct encoding {
|
|||
const char **);
|
||||
int (*sameName)(const struct encoding *,
|
||||
const char *, const char *);
|
||||
int (*nameMatchesAscii)(const struct encoding *,
|
||||
const char *, const char *);
|
||||
int (*getAtts)(const struct encoding *enc, const char *ptr,
|
||||
int attsMax, const char **atts);
|
||||
void (*updatePosition)(const struct encoding *,
|
||||
|
@ -101,6 +123,7 @@ literals, comments and processing instructions.
|
|||
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
|
||||
#define XmlNameMatchesAscii(enc, ptr1, ptr2) (((enc)->nameMatchesAscii)(enc, ptr1, ptr2))
|
||||
|
||||
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
|
||||
(((enc)->getAtts)(enc, ptr, attsMax, atts))
|
||||
|
|
|
@ -125,22 +125,43 @@ static
|
|||
int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr != end) {
|
||||
if (BYTE_TYPE(enc, ptr) == BT_MINUS)
|
||||
return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
|
||||
do {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
case BT_APOS:
|
||||
case BT_QUOT:
|
||||
case BT_LT:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
} while ((ptr += MINBPC) != end);
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (CHAR_MATCHES(enc, ptr, '-'))
|
||||
return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
|
||||
/* FIXME check for <![ */
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_PERCNT:
|
||||
if (ptr + MINBPC == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
/* don't allow <!ENTITY% foo "whatever"> */
|
||||
switch (BYTE_TYPE(enc, ptr + MINBPC)) {
|
||||
case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
/* fall through */
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DECL_OPEN;
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
@ -645,10 +666,70 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_DATA_CHARS;
|
||||
}
|
||||
|
||||
/* ptr points to character following "%" */
|
||||
|
||||
static
|
||||
int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
|
||||
case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PERCENT;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_SEMI:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_PARAM_ENTITY_REF;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_CR: case BT_LF: case BT_S:
|
||||
case BT_RPAR: case BT_GT: case BT_PERCNT:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_POUND_NAME;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
int tok;
|
||||
if (ptr == end)
|
||||
return XML_TOK_NONE;
|
||||
#if MINBPC > 1
|
||||
|
@ -663,10 +744,9 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
}
|
||||
#endif
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
MULTIBYTE_CASES(ptr, end, XML_TOK_PARTIAL_CHAR)
|
||||
case BT_QUOT:
|
||||
{
|
||||
/* FIXME multibyte, plus require S, > or % afterwards */
|
||||
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
|
||||
if (BYTE_TYPE(enc, ptr) == BT_QUOT) {
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
|
@ -677,6 +757,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
}
|
||||
case BT_APOS:
|
||||
{
|
||||
/* FIXME multibyte, plus require S, > or % afterwards */
|
||||
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
|
||||
if (BYTE_TYPE(enc, ptr) == BT_APOS) {
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
|
@ -722,29 +803,126 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_S;
|
||||
default:
|
||||
case BT_PERCNT:
|
||||
return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr);
|
||||
case BT_COMMA:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_COMMA;
|
||||
case BT_LSQB:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_OPEN_BRACKET;
|
||||
case BT_RSQB:
|
||||
/* FIXME check for ]]> */
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CLOSE_BRACKET;
|
||||
case BT_LPAR:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_OPEN_PAREN;
|
||||
case BT_RPAR:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_INVALID;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_AST:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CLOSE_PAREN_ASTERISK;
|
||||
case BT_QUEST:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CLOSE_PAREN_QUESTION;
|
||||
case BT_PLUS:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CLOSE_PAREN_PLUS;
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_CLOSE_PAREN;
|
||||
case BT_VERBAR:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_OR;
|
||||
case BT_GT:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_DECL_CLOSE;
|
||||
case BT_NUM:
|
||||
return PREFIX(scanPoundName)(enc, ptr + MINBPC, end, nextTokPtr);
|
||||
#define LEAD_CASE(n) \
|
||||
case BT_LEAD ## n: \
|
||||
if (end - ptr < n) \
|
||||
return XML_TOK_PARTIAL_CHAR; \
|
||||
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
|
||||
ptr += n; \
|
||||
tok = XML_TOK_NAME; \
|
||||
break; \
|
||||
} \
|
||||
if (IS_NAME_CHAR(enc, ptr, n)) { \
|
||||
ptr += n; \
|
||||
tok = XML_TOK_NMTOKEN; \
|
||||
break; \
|
||||
} \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_INVALID;
|
||||
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
|
||||
#undef LEAD_CASE
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
tok = XML_TOK_NAME;
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
for (; ptr != end;) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_LT:
|
||||
case BT_QUOT:
|
||||
case BT_APOS:
|
||||
case BT_NONXML:
|
||||
case BT_MALFORM:
|
||||
case BT_TRAIL:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
|
||||
default:
|
||||
case BT_DIGIT:
|
||||
case BT_NAME:
|
||||
case BT_MINUS:
|
||||
tok = XML_TOK_NMTOKEN;
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
case BT_NONASCII:
|
||||
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NAME;
|
||||
break;
|
||||
}
|
||||
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NMTOKEN;
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_GT: case BT_RPAR: case BT_COMMA:
|
||||
case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
*nextTokPtr = ptr;
|
||||
return tok;
|
||||
case BT_PLUS:
|
||||
if (tok != XML_TOK_NAME) {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_NAME_PLUS;
|
||||
case BT_AST:
|
||||
if (tok != XML_TOK_NAME) {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_NAME_ASTERISK;
|
||||
case BT_QUEST:
|
||||
if (tok != XML_TOK_NAME) {
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_NAME_QUESTION;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* This must only be called for a well-formed start-tag or empty element tag.
|
||||
|
@ -861,6 +1039,31 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
|
|||
/* not reached */
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2)
|
||||
{
|
||||
for (; *ptr2; ptr1 += MINBPC, ptr2++) {
|
||||
if (!CHAR_MATCHES(end, ptr1, *ptr2))
|
||||
return 0;
|
||||
}
|
||||
switch (BYTE_TYPE(enc, ptr1)) {
|
||||
case BT_LEAD2:
|
||||
case BT_LEAD3:
|
||||
case BT_LEAD4:
|
||||
case BT_LEAD5:
|
||||
case BT_LEAD6:
|
||||
case BT_NONASCII:
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
case BT_DIGIT:
|
||||
case BT_NAME:
|
||||
case BT_MINUS:
|
||||
return 0;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void PREFIX(updatePosition)(const ENCODING *enc,
|
||||
const char *ptr,
|
||||
|
|
|
@ -29,7 +29,14 @@ enum {
|
|||
BT_NAME,
|
||||
BT_MINUS,
|
||||
BT_OTHER, /* known not to be a name or name start character */
|
||||
BT_NONASCII /* might be a name or name start character */
|
||||
BT_NONASCII, /* might be a name or name start character */
|
||||
BT_PERCNT,
|
||||
BT_LPAR,
|
||||
BT_RPAR,
|
||||
BT_AST,
|
||||
BT_PLUS,
|
||||
BT_COMMA,
|
||||
BT_VERBAR
|
||||
};
|
||||
|
||||
#include <stddef.h>
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
static
|
||||
int skipProlog(const char **s, const char *end, const char **nextTokP,
|
||||
const ENCODING **enc);
|
||||
const ENCODING **enc, const char **doctypeP);
|
||||
static
|
||||
void setPosition(const ENCODING *enc,
|
||||
const char *start, const char *end,
|
||||
|
@ -26,6 +26,7 @@ wfCheck(const char *s, size_t n,
|
|||
const char *end = s + n;
|
||||
const char *next;
|
||||
const ENCODING *enc;
|
||||
const char *doctype = 0;
|
||||
size_t stackSize = 1024;
|
||||
size_t level = 0;
|
||||
int tok;
|
||||
|
@ -35,7 +36,7 @@ wfCheck(const char *s, size_t n,
|
|||
#define RETURN_CLEANUP(n) return (free((void *)startName), free((void *)atts), (n))
|
||||
if (!startName)
|
||||
return noMemory;
|
||||
tok = skipProlog(&s, end, &next, &enc);
|
||||
tok = skipProlog(&s, end, &next, &enc, &doctype);
|
||||
for (;;) {
|
||||
switch (tok) {
|
||||
case XML_TOK_NONE:
|
||||
|
@ -114,6 +115,10 @@ wfCheck(const char *s, size_t n,
|
|||
tok = XmlPrologTok(enc, s, end, &next);
|
||||
switch (tok) {
|
||||
case XML_TOK_NONE:
|
||||
if (doctype) {
|
||||
setPosition(enc, start, doctype, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(wellFormedOutsideDtd);
|
||||
}
|
||||
RETURN_CLEANUP(wellFormed);
|
||||
case XML_TOK_PROLOG_S:
|
||||
case XML_TOK_COMMENT:
|
||||
|
@ -138,11 +143,13 @@ wfCheck(const char *s, size_t n,
|
|||
|
||||
static
|
||||
int skipProlog(const char **startp, const char *end,
|
||||
const char **nextTokP, const ENCODING **enc)
|
||||
const char **nextTokP, const ENCODING **enc,
|
||||
const char **doctypeP)
|
||||
{
|
||||
const char *s = *startp;
|
||||
INIT_ENCODING initEnc;
|
||||
XmlInitEncoding(&initEnc, enc);
|
||||
*doctypeP = 0;
|
||||
for (;;) {
|
||||
int tok = XmlPrologTok(*enc, s, end, nextTokP);
|
||||
switch (tok) {
|
||||
|
@ -155,7 +162,25 @@ int skipProlog(const char **startp, const char *end,
|
|||
case XML_TOK_PARTIAL:
|
||||
*startp = s;
|
||||
return tok;
|
||||
case XML_TOK_DECL_OPEN:
|
||||
if (!*doctypeP) {
|
||||
if (XmlNameMatchesAscii(*enc, s + 2 * (*enc)->minBytesPerChar, "DOCTYPE"))
|
||||
*doctypeP = s;
|
||||
else {
|
||||
*startp = s;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case XML_TOK_PROLOG_S:
|
||||
case XML_TOK_LITERAL:
|
||||
case XML_TOK_COMMENT:
|
||||
break;
|
||||
default:
|
||||
if (!*doctypeP) {
|
||||
*startp = s;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
break;
|
||||
}
|
||||
s = *nextTokP;
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
enum WfCheckResult {
|
||||
wellFormed,
|
||||
wellFormedOutsideDtd,
|
||||
noMemory,
|
||||
noElements,
|
||||
invalidToken,
|
||||
|
|
|
@ -51,6 +51,7 @@ int doFile(const char *name)
|
|||
if (result) {
|
||||
static const char *message[] = {
|
||||
0,
|
||||
"DOCTYPE declaration ignored",
|
||||
"out of memory",
|
||||
"no element found",
|
||||
"invalid token",
|
||||
|
@ -63,7 +64,7 @@ int doFile(const char *name)
|
|||
fprintf(stderr, "%s:", name);
|
||||
if (badPtr != 0)
|
||||
fprintf(stderr, "%lu:%lu:", badLine+1, badCol);
|
||||
fprintf(stderr, "E: %s", message[result]);
|
||||
fprintf(stderr, "%c: %s", (result == wellFormedOutsideDtd ? 'W' : 'E'), message[result]);
|
||||
putc('\n', stderr);
|
||||
ret = 1;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue