From 9501138aae44531844959be873943fb088b40d9e Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 13 Nov 1997 09:05:46 +0000 Subject: [PATCH] Realistic prolog tokenization. --- expat/xmltok/asciitab.h | 8 +- expat/xmltok/xmltok.c | 21 +-- expat/xmltok/xmltok.h | 25 +++- expat/xmltok/xmltok_impl.c | 271 ++++++++++++++++++++++++++++++++----- expat/xmltok/xmltok_impl.h | 9 +- expat/xmlwf/wfcheck.c | 31 ++++- expat/xmlwf/wfcheck.h | 1 + expat/xmlwf/xmlwf.c | 3 +- 8 files changed, 307 insertions(+), 62 deletions(-) diff --git a/expat/xmltok/asciitab.h b/expat/xmltok/asciitab.h index eec36387..8a5c239d 100755 --- a/expat/xmltok/asciitab.h +++ b/expat/xmltok/asciitab.h @@ -7,9 +7,9 @@ /* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, /* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, /* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM, -/* 0x24 */ BT_OTHER, BT_OTHER, BT_AMP, BT_APOS, -/* 0x28 */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER, -/* 0x2C */ BT_OTHER, BT_MINUS, BT_NAME, BT_SOL, +/* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS, +/* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS, +/* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL, /* 0x30 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT, /* 0x34 */ BT_DIGIT, BT_DIGIT, BT_DIGIT, BT_DIGIT, /* 0x38 */ BT_DIGIT, BT_DIGIT, BT_NMSTRT, BT_SEMI, @@ -29,4 +29,4 @@ /* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, /* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, /* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER, -/* 0x7C */ BT_OTHER, BT_OTHER, BT_OTHER, BT_OTHER, +/* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER, diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index ce835a24..e846cdfc 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -1,18 +1,3 @@ -/* TODO - -Provide method to get name length. - -Provide methods to convert to any of UTF-8, UTF-18, UCS-4. 
- -Tokenize prologs in a way useful for well-formedness checking - - */ +#define XML_TOK_NAME 18 +#define XML_TOK_NMTOKEN 19 +#define XML_TOK_POUND_NAME 20 /* #name */ +#define XML_TOK_COMMA 21 +#define XML_TOK_OR 22 /* | */ +#define XML_TOK_PERCENT 23 +#define XML_TOK_OPEN_PAREN 24 +#define XML_TOK_CLOSE_PAREN 25 +#define XML_TOK_OPEN_BRACKET 26 +#define XML_TOK_CLOSE_BRACKET 27 +#define XML_TOK_CLOSE_PAREN_QUESTION 28 /* )? */ +#define XML_TOK_CLOSE_PAREN_ASTERISK 29 /* )* */ +#define XML_TOK_CLOSE_PAREN_PLUS 30 /* )+ */ +#define XML_TOK_NAME_QUESTION 31 /* name? */ +#define XML_TOK_NAME_ASTERISK 32 /* name* */ +#define XML_TOK_NAME_PLUS 33 /* name+ */ +#define XML_TOK_COND_SECT_OPEN 34 /* */ #define XML_NSTATES 2 #define XML_PROLOG_STATE 0 @@ -61,6 +81,8 @@ typedef struct encoding { const char **); int (*sameName)(const struct encoding *, const char *, const char *); + int (*nameMatchesAscii)(const struct encoding *, + const char *, const char *); int (*getAtts)(const struct encoding *enc, const char *ptr, int attsMax, const char **atts); void (*updatePosition)(const struct encoding *, @@ -101,6 +123,7 @@ literals, comments and processing instructions. 
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr) #define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2)) +#define XmlNameMatchesAscii(enc, ptr1, ptr2) (((enc)->nameMatchesAscii)(enc, ptr1, ptr2)) #define XmlGetAttributes(enc, ptr, attsMax, atts) \ (((enc)->getAtts)(enc, ptr, attsMax, atts)) diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c index be241e41..f49553fd 100755 --- a/expat/xmltok/xmltok_impl.c +++ b/expat/xmltok/xmltok_impl.c @@ -125,22 +125,43 @@ static int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr != end) { - if (BYTE_TYPE(enc, ptr) == BT_MINUS) - return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr); - do { - switch (BYTE_TYPE(enc, ptr)) { - MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS)) - INVALID_CASES(ptr, nextTokPtr) - case BT_APOS: - case BT_QUOT: - case BT_LT: - *nextTokPtr = ptr; - return XML_TOK_PROLOG_CHARS; - } - } while ((ptr += MINBPC) != end); + if (ptr == end) + return XML_TOK_PARTIAL; + if (CHAR_MATCHES(enc, ptr, '-')) + return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr); + /* FIXME check for */ + switch (BYTE_TYPE(enc, ptr + MINBPC)) { + case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + /* fall through */ + case BT_S: case BT_CR: case BT_LF: + *nextTokPtr = ptr; + return XML_TOK_DECL_OPEN; + case BT_NMSTRT: + case BT_HEX: + ptr += MINBPC; + break; + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } } return XML_TOK_PARTIAL; } @@ -645,10 +666,70 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_DATA_CHARS; } +/* ptr points to character following "%" */ + +static +int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, + const char **nextTokPtr) +{ + if (ptr == end) + return XML_TOK_PARTIAL; + switch (BYTE_TYPE(enc, ptr)) { + CHECK_NMSTRT_CASES(enc, ptr, end, 
nextTokPtr) + case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: + *nextTokPtr = ptr; + return XML_TOK_PERCENT; + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { + CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) + case BT_SEMI: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_PARAM_ENTITY_REF; + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + } + return XML_TOK_PARTIAL; +} + +static +int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, + const char **nextTokPtr) +{ + if (ptr == end) + return XML_TOK_PARTIAL; + switch (BYTE_TYPE(enc, ptr)) { + CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { + CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) + case BT_CR: case BT_LF: case BT_S: + case BT_RPAR: case BT_GT: case BT_PERCNT: + *nextTokPtr = ptr; + return XML_TOK_POUND_NAME; + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + } + return XML_TOK_PARTIAL; +} + + static int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { + int tok; if (ptr == end) return XML_TOK_NONE; #if MINBPC > 1 @@ -663,10 +744,9 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, } #endif switch (BYTE_TYPE(enc, ptr)) { - INVALID_CASES(ptr, nextTokPtr) - MULTIBYTE_CASES(ptr, end, XML_TOK_PARTIAL_CHAR) case BT_QUOT: { + /* FIXME multibyte, plus require S, > or % afterwards */ for (ptr += MINBPC; ptr != end; ptr += MINBPC) { if (BYTE_TYPE(enc, ptr) == BT_QUOT) { *nextTokPtr = ptr + MINBPC; @@ -677,6 +757,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, } case BT_APOS: { + /* FIXME multibyte, plus require S, > or % afterwards */ for (ptr += MINBPC; ptr != end; ptr += MINBPC) { if (BYTE_TYPE(enc, ptr) == BT_APOS) { *nextTokPtr = ptr + MINBPC; @@ -722,29 +803,126 @@ int 
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, } *nextTokPtr = ptr; return XML_TOK_PROLOG_S; - default: + case BT_PERCNT: + return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr); + case BT_COMMA: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_COMMA; + case BT_LSQB: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_OPEN_BRACKET; + case BT_RSQB: + /* FIXME check for ]]> */ + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CLOSE_BRACKET; + case BT_LPAR: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_OPEN_PAREN; + case BT_RPAR: + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; /* input ended right after ')'; an occurrence suffix may still follow */ + switch (BYTE_TYPE(enc, ptr)) { + case BT_AST: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CLOSE_PAREN_ASTERISK; + case BT_QUEST: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CLOSE_PAREN_QUESTION; + case BT_PLUS: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CLOSE_PAREN_PLUS; + } + *nextTokPtr = ptr; + return XML_TOK_CLOSE_PAREN; + case BT_VERBAR: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_OR; + case BT_GT: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_DECL_CLOSE; + case BT_NUM: + return PREFIX(scanPoundName)(enc, ptr + MINBPC, end, nextTokPtr); +#define LEAD_CASE(n) \ + case BT_LEAD ## n: \ + if (end - ptr < n) \ + return XML_TOK_PARTIAL_CHAR; \ + if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ + ptr += n; \ + tok = XML_TOK_NAME; \ + break; \ + } \ + if (IS_NAME_CHAR(enc, ptr, n)) { \ + ptr += n; \ + tok = XML_TOK_NMTOKEN; \ + break; \ + } \ + *nextTokPtr = ptr; \ + return XML_TOK_INVALID; + LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6) +#undef LEAD_CASE + case BT_NMSTRT: + case BT_HEX: + tok = XML_TOK_NAME; ptr += MINBPC; break; - } - for (; ptr != end;) { - switch (BYTE_TYPE(enc, ptr)) { - case BT_LT: - case BT_QUOT: - case BT_APOS: - case BT_NONXML: - case BT_MALFORM: - case BT_TRAIL: - case BT_S: case BT_CR: case BT_LF: - *nextTokPtr = ptr; - return XML_TOK_PROLOG_CHARS; - MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, 
XML_TOK_PROLOG_CHARS)) - default: + case BT_DIGIT: + case BT_NAME: + case BT_MINUS: + tok = XML_TOK_NMTOKEN; + ptr += MINBPC; + break; + case BT_NONASCII: + if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { ptr += MINBPC; + tok = XML_TOK_NAME; break; } + if (IS_NAME_CHAR(enc, ptr, MINBPC)) { + ptr += MINBPC; + tok = XML_TOK_NMTOKEN; + break; + } + /* fall through */ + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; } - *nextTokPtr = ptr; - return XML_TOK_PROLOG_CHARS; + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { + CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) + case BT_GT: case BT_RPAR: case BT_COMMA: + case BT_VERBAR: case BT_LSQB: case BT_PERCNT: + case BT_S: case BT_CR: case BT_LF: + *nextTokPtr = ptr; + return tok; + case BT_PLUS: + if (tok != XML_TOK_NAME) { + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + *nextTokPtr = ptr + MINBPC; + return XML_TOK_NAME_PLUS; + case BT_AST: + if (tok != XML_TOK_NAME) { + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + *nextTokPtr = ptr + MINBPC; + return XML_TOK_NAME_ASTERISK; + case BT_QUEST: + if (tok != XML_TOK_NAME) { + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + *nextTokPtr = ptr + MINBPC; + return XML_TOK_NAME_QUESTION; + default: + *nextTokPtr = ptr; + return XML_TOK_INVALID; + } + } + return XML_TOK_PARTIAL; } /* This must only be called for a well-formed start-tag or empty element tag. 
@@ -861,6 +1039,31 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) /* not reached */ } +static +int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2) +{ + for (; *ptr2; ptr1 += MINBPC, ptr2++) { + if (!CHAR_MATCHES(enc, ptr1, *ptr2)) + return 0; + } + switch (BYTE_TYPE(enc, ptr1)) { + case BT_LEAD2: + case BT_LEAD3: + case BT_LEAD4: + case BT_LEAD5: + case BT_LEAD6: + case BT_NONASCII: + case BT_NMSTRT: + case BT_HEX: + case BT_DIGIT: + case BT_NAME: + case BT_MINUS: + return 0; + default: + return 1; + } +} + static void PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, diff --git a/expat/xmltok/xmltok_impl.h b/expat/xmltok/xmltok_impl.h index ffad0279..e3d6dcfd 100755 --- a/expat/xmltok/xmltok_impl.h +++ b/expat/xmltok/xmltok_impl.h @@ -29,7 +29,14 @@ enum { BT_NAME, BT_MINUS, BT_OTHER, /* known not to be a name or name start character */ - BT_NONASCII /* might be a name or name start character */ + BT_NONASCII, /* might be a name or name start character */ + BT_PERCNT, + BT_LPAR, + BT_RPAR, + BT_AST, + BT_PLUS, + BT_COMMA, + BT_VERBAR }; #include diff --git a/expat/xmlwf/wfcheck.c b/expat/xmlwf/wfcheck.c index 5eaedfd4..254c5db3 100755 --- a/expat/xmlwf/wfcheck.c +++ b/expat/xmlwf/wfcheck.c @@ -10,7 +10,7 @@ static int skipProlog(const char **s, const char *end, const char **nextTokP, - const ENCODING **enc); + const ENCODING **enc, const char **doctypeP); static void setPosition(const ENCODING *enc, const char *start, const char *end, @@ -26,6 +26,7 @@ wfCheck(const char *s, size_t n, const char *end = s + n; const char *next; const ENCODING *enc; + const char *doctype = 0; size_t stackSize = 1024; size_t level = 0; int tok; @@ -35,7 +36,7 @@ wfCheck(const char *s, size_t n, #define RETURN_CLEANUP(n) return (free((void *)startName), free((void *)atts), (n)) if (!startName) return noMemory; - tok = skipProlog(&s, end, &next, &enc); + tok = skipProlog(&s, end, &next, &enc, &doctype); for 
(;;) { switch (tok) { case XML_TOK_NONE: @@ -114,6 +115,10 @@ wfCheck(const char *s, size_t n, tok = XmlPrologTok(enc, s, end, &next); switch (tok) { case XML_TOK_NONE: + if (doctype) { + setPosition(enc, start, doctype, badPtr, badLine, badCol); + RETURN_CLEANUP(wellFormedOutsideDtd); + } RETURN_CLEANUP(wellFormed); case XML_TOK_PROLOG_S: case XML_TOK_COMMENT: @@ -138,11 +143,13 @@ wfCheck(const char *s, size_t n, static int skipProlog(const char **startp, const char *end, - const char **nextTokP, const ENCODING **enc) + const char **nextTokP, const ENCODING **enc, + const char **doctypeP) { const char *s = *startp; INIT_ENCODING initEnc; XmlInitEncoding(&initEnc, enc); + *doctypeP = 0; for (;;) { int tok = XmlPrologTok(*enc, s, end, nextTokP); switch (tok) { @@ -155,7 +162,25 @@ int skipProlog(const char **startp, const char *end, case XML_TOK_PARTIAL: *startp = s; return tok; + case XML_TOK_DECL_OPEN: + if (!*doctypeP) { + if (XmlNameMatchesAscii(*enc, s + 2 * (*enc)->minBytesPerChar, "DOCTYPE")) + *doctypeP = s; + else { + *startp = s; + return XML_TOK_INVALID; + } + } + break; + case XML_TOK_PROLOG_S: + case XML_TOK_LITERAL: + case XML_TOK_COMMENT: + break; default: + if (!*doctypeP) { + *startp = s; + return XML_TOK_INVALID; + } break; } s = *nextTokP; diff --git a/expat/xmlwf/wfcheck.h b/expat/xmlwf/wfcheck.h index 7b410455..96f2c593 100755 --- a/expat/xmlwf/wfcheck.h +++ b/expat/xmlwf/wfcheck.h @@ -3,6 +3,7 @@ enum WfCheckResult { wellFormed, + wellFormedOutsideDtd, noMemory, noElements, invalidToken, diff --git a/expat/xmlwf/xmlwf.c b/expat/xmlwf/xmlwf.c index 9723dbda..1f73ce94 100755 --- a/expat/xmlwf/xmlwf.c +++ b/expat/xmlwf/xmlwf.c @@ -51,6 +51,7 @@ int doFile(const char *name) if (result) { static const char *message[] = { 0, + "DOCTYPE declaration ignored", "out of memory", "no element found", "invalid token", @@ -63,7 +64,7 @@ int doFile(const char *name) fprintf(stderr, "%s:", name); if (badPtr != 0) fprintf(stderr, "%lu:%lu:", badLine+1, badCol); 
- fprintf(stderr, "E: %s", message[result]); + fprintf(stderr, "%c: %s", (result == wellFormedOutsideDtd ? 'W' : 'E'), message[result]); putc('\n', stderr); ret = 1; }