Realistic prolog tokenization.

This commit is contained in:
James Clark 1997-11-13 09:05:46 +00:00
parent 7fc8cdf625
commit 9501138aae
8 changed files with 307 additions and 62 deletions

View file

@ -7,9 +7,9 @@
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM,
/* 0x24 */ BT_OTHER, BT_OTHER, BT_AMP, BT_APOS,
/* 0x28 */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER,
/* 0x2C */ BT_OTHER, BT_MINUS, BT_NAME, BT_SOL,
/* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
/* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
/* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
/* 0x30 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT,
/* 0x34 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT,
/* 0x38 */ BT_DIGIT, BT_DIGIT, BT_NMSTRT, BT_SEMI,
@ -29,4 +29,4 @@
/* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
/* 0x7C */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER,
/* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER,

View file

@ -1,18 +1,3 @@
/* TODO
Provide method to get name length.
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
Tokenize prologs in a way useful for well-formedness checking
<!NAME
NMTOKEN
NAME
PEREF
*/
#ifdef _MSC_VER
#define XMLTOKAPI __declspec(dllexport)
#endif
@ -80,7 +65,7 @@ struct normal_encoding {
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 },
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
@ -129,7 +114,7 @@ static int unicode_byte_type(char hi, char lo)
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX
@ -154,7 +139,7 @@ const struct encoding little2_encoding = {
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX

View file

@ -38,8 +38,28 @@ of the prolog and is also returned by XmlContentTok */
/* The following tokens are returned only by XmlPrologTok */
#define XML_TOK_LITERAL 13
#define XML_TOK_PROLOG_CHARS 14
#define XML_TOK_PARAM_ENTITY_REF 14
#define XML_TOK_PROLOG_S 15
#define XML_TOK_DECL_OPEN 16 /* <!foo */
#define XML_TOK_DECL_CLOSE 17 /* > */
#define XML_TOK_NAME 18
#define XML_TOK_NMTOKEN 19
#define XML_TOK_POUND_NAME 20 /* #name */
#define XML_TOK_COMMA 21
#define XML_TOK_OR 22 /* | */
#define XML_TOK_PERCENT 23
#define XML_TOK_OPEN_PAREN 24
#define XML_TOK_CLOSE_PAREN 25
#define XML_TOK_OPEN_BRACKET 26
#define XML_TOK_CLOSE_BRACKET 27
#define XML_TOK_CLOSE_PAREN_QUESTION 28 /* )? */
#define XML_TOK_CLOSE_PAREN_ASTERISK 29 /* )* */
#define XML_TOK_CLOSE_PAREN_PLUS 30 /* )+ */
#define XML_TOK_NAME_QUESTION 31 /* name? */
#define XML_TOK_NAME_ASTERISK 32 /* name* */
#define XML_TOK_NAME_PLUS 33 /* name+ */
#define XML_TOK_COND_SECT_OPEN 34 /* <![ */
#define XML_TOK_COND_SECT_CLOSE 35 /* ]]> */
#define XML_NSTATES 2
#define XML_PROLOG_STATE 0
@ -61,6 +81,8 @@ typedef struct encoding {
const char **);
int (*sameName)(const struct encoding *,
const char *, const char *);
int (*nameMatchesAscii)(const struct encoding *,
const char *, const char *);
int (*getAtts)(const struct encoding *enc, const char *ptr,
int attsMax, const char **atts);
void (*updatePosition)(const struct encoding *,
@ -101,6 +123,7 @@ literals, comments and processing instructions.
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
#define XmlNameMatchesAscii(enc, ptr1, ptr2) (((enc)->nameMatchesAscii)(enc, ptr1, ptr2))
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
(((enc)->getAtts)(enc, ptr, attsMax, atts))

View file

@ -125,22 +125,43 @@ static
/* Scan the opening of a markup declaration in the prolog (the token that
   the header describes as XML_TOK_DECL_OPEN, i.e. "<!foo").
   NOTE(review): this listing comes from a rendered diff whose +/- markers
   were lost, so lines of the previous and of the new implementation appear
   interleaved below; confirm against the real file before relying on the
   statement order shown here.
   Presumably ptr points just past "<!" -- TODO confirm against the caller.
   Behavior visible in the newer lines:
     - XML_TOK_PARTIAL is returned when the input ends before a decision;
     - a leading '-' hands the rest of the scan to scanComment;
     - a run of BT_NMSTRT/BT_HEX characters ended by whitespace or '%'
       yields XML_TOK_DECL_OPEN with *nextTokPtr left on the terminator;
     - any other character stores its position in *nextTokPtr and returns
       XML_TOK_INVALID. */
int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
if (ptr != end) {
if (BYTE_TYPE(enc, ptr) == BT_MINUS)
return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
do {
switch (BYTE_TYPE(enc, ptr)) {
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
INVALID_CASES(ptr, nextTokPtr)
case BT_APOS:
case BT_QUOT:
case BT_LT:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
}
} while ((ptr += MINBPC) != end);
/* Nothing to look at yet: ask the caller for more input. */
if (ptr == end)
return XML_TOK_PARTIAL;
/* A '-' here is treated as the start of a comment. */
if (CHAR_MATCHES(enc, ptr, '-'))
return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
/* FIXME check for <![ */
/* The first character of the declaration keyword must be BT_NMSTRT or
   BT_HEX. */
switch (BYTE_TYPE(enc, ptr)) {
case BT_NMSTRT:
case BT_HEX:
ptr += MINBPC;
break;
default:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
return XML_TOK_INVALID;
}
/* Consume the remaining keyword characters until a delimiter is found. */
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
case BT_PERCNT:
/* Need one character of lookahead after the '%'. */
if (ptr + MINBPC == end)
return XML_TOK_PARTIAL;
/* don't allow <!ENTITY% foo "whatever"> */
switch (BYTE_TYPE(enc, ptr + MINBPC)) {
case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
/* fall through */
case BT_S: case BT_CR: case BT_LF:
/* The delimiter itself is not consumed; it starts the next token. */
*nextTokPtr = ptr;
return XML_TOK_DECL_OPEN;
case BT_NMSTRT:
case BT_HEX:
ptr += MINBPC;
break;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
/* Ran out of input in the middle of the keyword. */
return XML_TOK_PARTIAL;
}
@ -645,10 +666,70 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_DATA_CHARS;
}
/* Scan what follows a "%" in the prolog; ptr points to the character
   following the "%".

   Returns:
     XML_TOK_PARTIAL          - the input ended before the token could be
                                decided;
     XML_TOK_PERCENT          - the "%" stands alone, i.e. the next character
                                is whitespace or another "%"; *nextTokPtr is
                                set to ptr (that next character);
     XML_TOK_PARAM_ENTITY_REF - a name terminated by ";"; *nextTokPtr is set
                                just past the ";";
     XML_TOK_INVALID          - any other character; *nextTokPtr points at it.

   CHECK_NMSTRT_CASES and CHECK_NAME_CASES are defined elsewhere; presumably
   they step ptr over one name-start / name character, or return when the
   character is malformed or truncated -- confirm against their definitions. */
static
int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  /* The first macro argument is the encoding (as in the CHECK_NAME_CASES
     call below), not the end pointer. */
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    *nextTokPtr = ptr;
    return XML_TOK_PERCENT;
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Inside the entity name: keep going until the closing ";". */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_SEMI:
      *nextTokPtr = ptr + MINBPC;
      return XML_TOK_PARAM_ENTITY_REF;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
/* Scan the name that follows a "#" in the prolog (the header describes
   XML_TOK_POUND_NAME as "#name"); ptr points to the character following
   the "#".

   Returns:
     XML_TOK_PARTIAL    - the input ended before the name was delimited;
     XML_TOK_POUND_NAME - a name followed by whitespace, ")", ">" or "%";
                          *nextTokPtr is left on that delimiter;
     XML_TOK_INVALID    - the first character cannot start a name, or the
                          name is followed by any other character;
                          *nextTokPtr points at the offending character.

   CHECK_NMSTRT_CASES and CHECK_NAME_CASES are defined elsewhere; presumably
   they step ptr over one name-start / name character, or return when the
   character is malformed or truncated -- confirm against their definitions. */
static
int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
                          const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  /* The first macro argument is the encoding (as in the CHECK_NAME_CASES
     call below), not the end pointer. */
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Inside the name: stop at the first delimiter without consuming it. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_CR: case BT_LF: case BT_S:
    case BT_RPAR: case BT_GT: case BT_PERCNT:
      *nextTokPtr = ptr;
      return XML_TOK_POUND_NAME;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
static
int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
int tok;
if (ptr == end)
return XML_TOK_NONE;
#if MINBPC > 1
@ -663,10 +744,9 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
}
#endif
switch (BYTE_TYPE(enc, ptr)) {
INVALID_CASES(ptr, nextTokPtr)
MULTIBYTE_CASES(ptr, end, XML_TOK_PARTIAL_CHAR)
case BT_QUOT:
{
/* FIXME multibyte, plus require S, > or % afterwards */
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
if (BYTE_TYPE(enc, ptr) == BT_QUOT) {
*nextTokPtr = ptr + MINBPC;
@ -677,6 +757,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
}
case BT_APOS:
{
/* FIXME multibyte, plus require S, > or % afterwards */
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
if (BYTE_TYPE(enc, ptr) == BT_APOS) {
*nextTokPtr = ptr + MINBPC;
@ -722,29 +803,126 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
}
*nextTokPtr = ptr;
return XML_TOK_PROLOG_S;
default:
case BT_PERCNT:
return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr);
case BT_COMMA:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_COMMA;
case BT_LSQB:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_OPEN_BRACKET;
case BT_RSQB:
/* FIXME check for ]]> */
*nextTokPtr = ptr + MINBPC;
return XML_TOK_CLOSE_BRACKET;
case BT_LPAR:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_OPEN_PAREN;
case BT_RPAR:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_INVALID;
switch (BYTE_TYPE(enc, ptr)) {
case BT_AST:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_CLOSE_PAREN_ASTERISK;
case BT_QUEST:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_CLOSE_PAREN_QUESTION;
case BT_PLUS:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_CLOSE_PAREN_PLUS;
}
*nextTokPtr = ptr;
return XML_TOK_CLOSE_PAREN;
case BT_VERBAR:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_OR;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_DECL_CLOSE;
case BT_NUM:
return PREFIX(scanPoundName)(enc, ptr + MINBPC, end, nextTokPtr);
#define LEAD_CASE(n) \
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
ptr += n; \
tok = XML_TOK_NAME; \
break; \
} \
if (IS_NAME_CHAR(enc, ptr, n)) { \
ptr += n; \
tok = XML_TOK_NMTOKEN; \
break; \
} \
*nextTokPtr = ptr; \
return XML_TOK_INVALID;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
#undef LEAD_CASE
case BT_NMSTRT:
case BT_HEX:
tok = XML_TOK_NAME;
ptr += MINBPC;
break;
}
for (; ptr != end;) {
switch (BYTE_TYPE(enc, ptr)) {
case BT_LT:
case BT_QUOT:
case BT_APOS:
case BT_NONXML:
case BT_MALFORM:
case BT_TRAIL:
case BT_S: case BT_CR: case BT_LF:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
default:
case BT_DIGIT:
case BT_NAME:
case BT_MINUS:
tok = XML_TOK_NMTOKEN;
ptr += MINBPC;
break;
case BT_NONASCII:
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
ptr += MINBPC;
tok = XML_TOK_NAME;
break;
}
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
ptr += MINBPC;
tok = XML_TOK_NMTOKEN;
break;
}
/* fall through */
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
case BT_GT: case BT_RPAR: case BT_COMMA:
case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
case BT_S: case BT_CR: case BT_LF:
*nextTokPtr = ptr;
return tok;
case BT_PLUS:
if (tok != XML_TOK_NAME) {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_NAME_PLUS;
case BT_AST:
if (tok != XML_TOK_NAME) {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_NAME_ASTERISK;
case BT_QUEST:
if (tok != XML_TOK_NAME) {
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_NAME_QUESTION;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
return XML_TOK_PARTIAL;
}
/* This must only be called for a well-formed start-tag or empty element tag.
@ -861,6 +1039,31 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
/* not reached */
}
/* Test whether the name starting at ptr1 (in the encoding handled by this
   instantiation) is exactly the NUL-terminated ASCII string ptr2.

   Each character of ptr2 is compared against successive MINBPC-sized units
   at ptr1.  After all of ptr2 has matched, the character that follows in
   ptr1 must not be one that could continue a name; otherwise ptr2 is only a
   prefix of the name and the result is 0.

   Returns 1 on an exact match, 0 otherwise.

   NOTE(review): no end-of-buffer pointer is passed in, so the character
   following the match is read unconditionally -- callers presumably
   guarantee that the name is followed by at least one more character;
   confirm. */
static
int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2)
{
  for (; *ptr2; ptr1 += MINBPC, ptr2++) {
    /* Pass the encoding; there is no `end` in this function's scope. */
    if (!CHAR_MATCHES(enc, ptr1, *ptr2))
      return 0;
  }
  switch (BYTE_TYPE(enc, ptr1)) {
  /* Any of these byte types could continue the name, so ptr2 was only a
     proper prefix of it. */
  case BT_LEAD2:
  case BT_LEAD3:
  case BT_LEAD4:
  case BT_LEAD5:
  case BT_LEAD6:
  case BT_NONASCII:
  case BT_NMSTRT:
  case BT_HEX:
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
    return 0;
  default:
    return 1;
  }
}
static
void PREFIX(updatePosition)(const ENCODING *enc,
const char *ptr,

View file

@ -29,7 +29,14 @@ enum {
BT_NAME,
BT_MINUS,
BT_OTHER, /* known not to be a name or name start character */
BT_NONASCII /* might be a name or name start character */
BT_NONASCII, /* might be a name or name start character */
BT_PERCNT,
BT_LPAR,
BT_RPAR,
BT_AST,
BT_PLUS,
BT_COMMA,
BT_VERBAR
};
#include <stddef.h>

View file

@ -10,7 +10,7 @@
static
int skipProlog(const char **s, const char *end, const char **nextTokP,
const ENCODING **enc);
const ENCODING **enc, const char **doctypeP);
static
void setPosition(const ENCODING *enc,
const char *start, const char *end,
@ -26,6 +26,7 @@ wfCheck(const char *s, size_t n,
const char *end = s + n;
const char *next;
const ENCODING *enc;
const char *doctype = 0;
size_t stackSize = 1024;
size_t level = 0;
int tok;
@ -35,7 +36,7 @@ wfCheck(const char *s, size_t n,
#define RETURN_CLEANUP(n) return (free((void *)startName), free((void *)atts), (n))
if (!startName)
return noMemory;
tok = skipProlog(&s, end, &next, &enc);
tok = skipProlog(&s, end, &next, &enc, &doctype);
for (;;) {
switch (tok) {
case XML_TOK_NONE:
@ -114,6 +115,10 @@ wfCheck(const char *s, size_t n,
tok = XmlPrologTok(enc, s, end, &next);
switch (tok) {
case XML_TOK_NONE:
if (doctype) {
setPosition(enc, start, doctype, badPtr, badLine, badCol);
RETURN_CLEANUP(wellFormedOutsideDtd);
}
RETURN_CLEANUP(wellFormed);
case XML_TOK_PROLOG_S:
case XML_TOK_COMMENT:
@ -138,11 +143,13 @@ wfCheck(const char *s, size_t n,
static
int skipProlog(const char **startp, const char *end,
const char **nextTokP, const ENCODING **enc)
const char **nextTokP, const ENCODING **enc,
const char **doctypeP)
{
const char *s = *startp;
INIT_ENCODING initEnc;
XmlInitEncoding(&initEnc, enc);
*doctypeP = 0;
for (;;) {
int tok = XmlPrologTok(*enc, s, end, nextTokP);
switch (tok) {
@ -155,7 +162,25 @@ int skipProlog(const char **startp, const char *end,
case XML_TOK_PARTIAL:
*startp = s;
return tok;
case XML_TOK_DECL_OPEN:
if (!*doctypeP) {
if (XmlNameMatchesAscii(*enc, s + 2 * (*enc)->minBytesPerChar, "DOCTYPE"))
*doctypeP = s;
else {
*startp = s;
return XML_TOK_INVALID;
}
}
break;
case XML_TOK_PROLOG_S:
case XML_TOK_LITERAL:
case XML_TOK_COMMENT:
break;
default:
if (!*doctypeP) {
*startp = s;
return XML_TOK_INVALID;
}
break;
}
s = *nextTokP;

View file

@ -3,6 +3,7 @@
enum WfCheckResult {
wellFormed,
wellFormedOutsideDtd,
noMemory,
noElements,
invalidToken,

View file

@ -51,6 +51,7 @@ int doFile(const char *name)
if (result) {
static const char *message[] = {
0,
"DOCTYPE declaration ignored",
"out of memory",
"no element found",
"invalid token",
@ -63,7 +64,7 @@ int doFile(const char *name)
fprintf(stderr, "%s:", name);
if (badPtr != 0)
fprintf(stderr, "%lu:%lu:", badLine+1, badCol);
fprintf(stderr, "E: %s", message[result]);
fprintf(stderr, "%c: %s", (result == wellFormedOutsideDtd ? 'W' : 'E'), message[result]);
putc('\n', stderr);
ret = 1;
}