mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-05 05:05:00 +00:00
Support for line and column numbers
This commit is contained in:
parent
84be77b536
commit
9651443ca7
8 changed files with 161 additions and 41 deletions
|
@ -1,7 +1,7 @@
|
|||
/* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x08 */ BT_NONXML, BT_S, BT_S, BT_NONXML,
|
||||
/* 0x0C */ BT_NONXML, BT_S, BT_NONXML, BT_NONXML,
|
||||
/* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
|
||||
/* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
|
||||
/* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
Provide method to get name length.
|
||||
|
||||
Provide method to count lines/columns.
|
||||
|
||||
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
|
||||
|
||||
Tokenize prologs in a way useful for well-formedness checking
|
||||
|
@ -82,7 +80,7 @@ struct normal_encoding {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct normal_encoding utf8_encoding = {
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 },
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
};
|
||||
|
@ -131,7 +129,7 @@ static int unicode_byte_type(char hi, char lo)
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding little2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
@ -156,7 +154,7 @@ const struct encoding little2_encoding = {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding big2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
@ -215,10 +213,18 @@ int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
|
||||
}
|
||||
|
||||
static
|
||||
void initUpdatePosition(const ENCODING *enc, const char *ptr,
|
||||
const char *end, POSITION *pos)
|
||||
{
|
||||
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
|
||||
}
|
||||
|
||||
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
|
||||
{
|
||||
p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
|
||||
p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
|
||||
p->initEnc.updatePosition = initUpdatePosition;
|
||||
p->initEnc.minBytesPerChar = 1;
|
||||
p->encPtr = encPtr;
|
||||
*encPtr = &(p->initEnc);
|
||||
|
|
|
@ -45,6 +45,15 @@ of the prolog and is also returned by XmlContentTok */
|
|||
#define XML_PROLOG_STATE 0
|
||||
#define XML_CONTENT_STATE 1
|
||||
|
||||
/* Line/column location accumulated by the updatePosition functions. */
typedef struct position {
  /* Both counters are zero-based: the first line is line 0 and the
     first column is column 0. */
  unsigned long lineNumber;
  unsigned long columnNumber;
  /* Non-zero when the last character counted was a CR; an LF that
     immediately follows should then be ignored rather than counted. */
  int ignoreInitialLF;
} POSITION;
|
||||
|
||||
typedef struct encoding {
|
||||
int (*scanners[XML_NSTATES])(const struct encoding *,
|
||||
const char *,
|
||||
|
@ -54,6 +63,10 @@ typedef struct encoding {
|
|||
const char *, const char *);
|
||||
int (*getAtts)(const struct encoding *enc, const char *ptr,
|
||||
int attsMax, const char **atts);
|
||||
void (*updatePosition)(const struct encoding *,
|
||||
const char *ptr,
|
||||
const char *end,
|
||||
POSITION *);
|
||||
int minBytesPerChar;
|
||||
} ENCODING;
|
||||
|
||||
|
@ -92,6 +105,9 @@ literals, comments and processing instructions.
|
|||
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
|
||||
(((enc)->getAtts)(enc, ptr, attsMax, atts))
|
||||
|
||||
/* Call the encoding's updatePosition function on the range [ptr, end),
   updating the POSITION that pos points to. */
#define XmlUpdatePosition(enc, ptr, end, pos) \
  ((*(enc)->updatePosition)(enc, ptr, end, pos))
|
||||
|
||||
typedef struct {
|
||||
ENCODING initEnc;
|
||||
const ENCODING **encPtr;
|
||||
|
|
|
@ -162,7 +162,7 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
ptr += MINBPC;
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
|
@ -252,10 +252,10 @@ int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
break;
|
||||
case BT_GT:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
|
@ -380,7 +380,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
for (;;) {
|
||||
int t;
|
||||
|
||||
|
@ -390,7 +390,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
t = BYTE_TYPE(enc, ptr);
|
||||
if (t == BT_EQUALS)
|
||||
break;
|
||||
if (t != BT_S) {
|
||||
switch (t) {
|
||||
case BT_S:
|
||||
case BT_LF:
|
||||
case BT_CR:
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
|
@ -407,7 +412,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
open = BYTE_TYPE(enc, ptr);
|
||||
if (open == BT_QUOT || open == BT_APOS)
|
||||
break;
|
||||
if (open != BT_S) {
|
||||
switch (open) {
|
||||
case BT_S:
|
||||
case BT_LF:
|
||||
case BT_CR:
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
|
@ -449,7 +459,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
continue;
|
||||
case BT_GT:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
|
@ -513,7 +523,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
{
|
||||
ptr += MINBPC;
|
||||
while (ptr != end) {
|
||||
|
@ -523,7 +533,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
goto gt;
|
||||
case BT_SOL:
|
||||
goto sol;
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
ptr += MINBPC;
|
||||
continue;
|
||||
default:
|
||||
|
@ -697,10 +707,19 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
case BT_S:
|
||||
do {
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
for (;;) {
|
||||
ptr += MINBPC;
|
||||
} while (ptr != end && BYTE_TYPE(enc, ptr) == BT_S);
|
||||
if (ptr == end)
|
||||
break;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
break;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_S;
|
||||
}
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_S;
|
||||
default:
|
||||
|
@ -715,7 +734,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
case BT_NONXML:
|
||||
case BT_MALFORM:
|
||||
case BT_TRAIL:
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PROLOG_CHARS;
|
||||
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
|
||||
|
@ -774,7 +793,7 @@ int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
|
|||
else if (open == BT_APOS)
|
||||
state = other;
|
||||
break;
|
||||
case BT_S:
|
||||
case BT_S: case BT_CR: case BT_LF:
|
||||
/* This case ensures that the first attribute name is counted
|
||||
Apart from that we could just change state on the quote. */
|
||||
if (state == inName)
|
||||
|
@ -842,6 +861,47 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
|
|||
/* not reached */
|
||||
}
|
||||
|
||||
static
|
||||
void PREFIX(updatePosition)(const ENCODING *enc,
|
||||
const char *ptr,
|
||||
const char *end,
|
||||
POSITION *pos)
|
||||
{
|
||||
if (pos->ignoreInitialLF) {
|
||||
if (ptr == end)
|
||||
return;
|
||||
if (CHAR_MATCHES(enc, ptr, '\n'))
|
||||
ptr += MINBPC;
|
||||
pos->ignoreInitialLF = 0;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
MULTIBYTE_CASES(ptr, end, ;/* hack! */)
|
||||
case BT_LF:
|
||||
pos->columnNumber = (unsigned)-1;
|
||||
pos->lineNumber++;
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
case BT_CR:
|
||||
pos->lineNumber++;
|
||||
ptr += MINBPC;
|
||||
if (ptr == end) {
|
||||
pos->ignoreInitialLF = 1;
|
||||
pos->columnNumber = 0;
|
||||
return;
|
||||
}
|
||||
pos->columnNumber = (unsigned)-1;
|
||||
if (CHAR_MATCHES(enc, ptr, '\n'))
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
pos->columnNumber++;
|
||||
}
|
||||
}
|
||||
|
||||
#undef DO_LEAD_CASE
|
||||
#undef MULTIBYTE_CASES
|
||||
#undef INVALID_CASES
|
||||
|
|
|
@ -21,6 +21,8 @@ enum {
|
|||
BT_LSQB,
|
||||
BT_RSQB,
|
||||
BT_S,
|
||||
BT_CR,
|
||||
BT_LF,
|
||||
BT_NMSTRT,
|
||||
BT_HEX,
|
||||
BT_DIGIT,
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "wfcheck.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
@ -10,9 +11,14 @@
|
|||
static
|
||||
int skipProlog(const char **s, const char *end, const char **nextTokP,
|
||||
const ENCODING **enc);
|
||||
static
|
||||
void setPosition(const ENCODING *enc,
|
||||
const char *start, const char *end,
|
||||
const char **badPtr, unsigned long *badLine, unsigned long *badCol);
|
||||
|
||||
enum WfCheckResult
|
||||
wfCheck(const char *s, size_t n, const char **badPtr)
|
||||
wfCheck(const char *s, size_t n,
|
||||
const char **badPtr, unsigned long *badLine, unsigned long *badCol)
|
||||
{
|
||||
unsigned nElements = 0;
|
||||
unsigned nAtts = 0;
|
||||
|
@ -33,16 +39,16 @@ wfCheck(const char *s, size_t n, const char **badPtr)
|
|||
for (;;) {
|
||||
switch (tok) {
|
||||
case XML_TOK_NONE:
|
||||
*badPtr = s;
|
||||
setPosition(enc, start, s, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(noElements);
|
||||
case XML_TOK_INVALID:
|
||||
*badPtr = next;
|
||||
setPosition(enc, start, next, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(invalidToken);
|
||||
case XML_TOK_PARTIAL:
|
||||
*badPtr = s;
|
||||
setPosition(enc, start, s, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(unclosedToken);
|
||||
case XML_TOK_PARTIAL_CHAR:
|
||||
*badPtr = s;
|
||||
setPosition(enc, start, s, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(partialChar);
|
||||
case XML_TOK_EMPTY_ELEMENT_NO_ATTS:
|
||||
nElements++;
|
||||
|
@ -87,7 +93,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
|
|||
int j;
|
||||
for (j = 0; j < i; j++) {
|
||||
if (XmlSameName(enc, atts[i], atts[j])) {
|
||||
*badPtr = atts[i];
|
||||
setPosition(enc, start, atts[i], badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(duplicateAttribute);
|
||||
}
|
||||
}
|
||||
|
@ -97,7 +103,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
|
|||
case XML_TOK_END_TAG:
|
||||
--level;
|
||||
if (!XmlSameName(enc, startName[level], s + enc->minBytesPerChar * 2)) {
|
||||
*badPtr = s;
|
||||
setPosition(enc, start, s, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(tagMismatch);
|
||||
}
|
||||
break;
|
||||
|
@ -116,7 +122,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
|
|||
break;
|
||||
default:
|
||||
if (tok > 0) {
|
||||
*badPtr = s;
|
||||
setPosition(enc, start, s, badPtr, badLine, badCol);
|
||||
RETURN_CLEANUP(junkAfterDocElement);
|
||||
}
|
||||
break;
|
||||
|
@ -156,3 +162,16 @@ int skipProlog(const char **startp, const char *end,
|
|||
}
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
static
|
||||
void setPosition(const ENCODING *enc,
|
||||
const char *start, const char *end,
|
||||
const char **badPtr, unsigned long *badLine, unsigned long *badCol)
|
||||
{
|
||||
POSITION pos;
|
||||
memset(&pos, 0, sizeof(POSITION));
|
||||
XmlUpdatePosition(enc, start, end, &pos);
|
||||
*badPtr = end;
|
||||
*badLine = pos.lineNumber;
|
||||
*badCol = pos.columnNumber;
|
||||
}
|
||||
|
|
|
@ -13,5 +13,8 @@ enum WfCheckResult {
|
|||
junkAfterDocElement
|
||||
};
|
||||
|
||||
enum WfCheckResult wfCheck(const char *s, size_t n, const char **badPtr);
|
||||
enum WfCheckResult wfCheck(const char *s, size_t n,
|
||||
const char **errorPtr,
|
||||
unsigned long *errorLineNumber,
|
||||
unsigned long *errorColNumber);
|
||||
|
||||
|
|
|
@ -10,8 +10,11 @@ int doFile(const char *name)
|
|||
HANDLE f;
|
||||
HANDLE m;
|
||||
DWORD size;
|
||||
DWORD sizeHi;
|
||||
const char *p;
|
||||
const char *bad = 0;
|
||||
const char *badPtr = 0;
|
||||
unsigned long badLine = 0;
|
||||
unsigned long badCol = 0;
|
||||
int ret;
|
||||
enum WfCheckResult result;
|
||||
|
||||
|
@ -21,7 +24,16 @@ int doFile(const char *name)
|
|||
fprintf(stderr, "%s: CreateFile failed\n", name);
|
||||
return 0;
|
||||
}
|
||||
size = GetFileSize(f, NULL);
|
||||
size = GetFileSize(f, &sizeHi);
|
||||
if (sizeHi) {
|
||||
fprintf(stderr, "%s: too big (limit 2Gb)\n", name);
|
||||
return 0;
|
||||
}
|
||||
/* CreateFileMapping barfs on zero length files */
|
||||
if (size == 0) {
|
||||
fprintf(stderr, "%s: zero-length file\n", name);
|
||||
return 0;
|
||||
}
|
||||
m = CreateFileMapping(f, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
if (m == NULL) {
|
||||
fprintf(stderr, "%s: CreateFileMapping failed\n", name);
|
||||
|
@ -35,21 +47,23 @@ int doFile(const char *name)
|
|||
fprintf(stderr, "%s: MapViewOfFile failed\n", name);
|
||||
return 0;
|
||||
}
|
||||
result = wfCheck(p, size, &bad);
|
||||
result = wfCheck(p, size, &badPtr, &badLine, &badCol);
|
||||
if (result) {
|
||||
static const char *message[] = {
|
||||
0,
|
||||
"out of memory",
|
||||
"no element found",
|
||||
"invalid token after %lu bytes",
|
||||
"unclosed token started after %lu bytes",
|
||||
"unclosed token started after %lu bytes",
|
||||
"mismatched tag after %lu bytes",
|
||||
"duplicate attribute after %lu bytes",
|
||||
"junk after document element after %lu bytes",
|
||||
"invalid token",
|
||||
"unclosed token",
|
||||
"unclosed token",
|
||||
"mismatched tag",
|
||||
"duplicate attribute",
|
||||
"junk after document element",
|
||||
};
|
||||
fprintf(stderr, "%s: ", name);
|
||||
fprintf(stderr, message[result], (unsigned long)(bad - p));
|
||||
fprintf(stderr, "%s:", name);
|
||||
if (badPtr != 0)
|
||||
fprintf(stderr, "%lu:%lu:", badLine+1, badCol);
|
||||
fprintf(stderr, "E: %s", message[result]);
|
||||
putc('\n', stderr);
|
||||
ret = 1;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue