Support for line and column numbers

This commit is contained in:
James Clark 1997-11-12 10:38:58 +00:00
parent 84be77b536
commit 9651443ca7
8 changed files with 161 additions and 41 deletions

View file

@ -1,7 +1,7 @@
/* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x08 */ BT_NONXML, BT_S, BT_S, BT_NONXML,
/* 0x0C */ BT_NONXML, BT_S, BT_NONXML, BT_NONXML,
/* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
/* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
/* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,

View file

@ -2,8 +2,6 @@
Provide method to get name length.
Provide method to count lines/columns.
Provide methods to convert to any of UTF-8, UTF-18, UCS-4.
Tokenize prologs in a way useful for well-formedness checking
@ -82,7 +80,7 @@ struct normal_encoding {
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
@ -131,7 +129,7 @@ static int unicode_byte_type(char hi, char lo)
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX
@ -156,7 +154,7 @@ const struct encoding little2_encoding = {
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX
@ -215,10 +213,18 @@ int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
}
static
void initUpdatePosition(const ENCODING *enc, const char *ptr,
const char *end, POSITION *pos)
{
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
{
p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
p->initEnc.updatePosition = initUpdatePosition;
p->initEnc.minBytesPerChar = 1;
p->encPtr = encPtr;
*encPtr = &(p->initEnc);

View file

@ -45,6 +45,15 @@ of the prolog and is also returned by XmlContentTok */
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
typedef struct position {
/* first line and first column are 0 not 1 */
unsigned long lineNumber;
unsigned long columnNumber;
/* if the last character counted was CR, then an immediately
following LF should be ignored */
int ignoreInitialLF;
} POSITION;
typedef struct encoding {
int (*scanners[XML_NSTATES])(const struct encoding *,
const char *,
@ -54,6 +63,10 @@ typedef struct encoding {
const char *, const char *);
int (*getAtts)(const struct encoding *enc, const char *ptr,
int attsMax, const char **atts);
void (*updatePosition)(const struct encoding *,
const char *ptr,
const char *end,
POSITION *);
int minBytesPerChar;
} ENCODING;
@ -92,6 +105,9 @@ literals, comments and processing instructions.
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
(((enc)->getAtts)(enc, ptr, attsMax, atts))
#define XmlUpdatePosition(enc, ptr, end, pos) \
(((enc)->updatePosition)(enc, ptr, end, pos))
typedef struct {
ENCODING initEnc;
const ENCODING **encPtr;

View file

@ -162,7 +162,7 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
case BT_S:
case BT_S: case BT_CR: case BT_LF:
ptr += MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
@ -252,10 +252,10 @@ int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
case BT_S:
case BT_S: case BT_CR: case BT_LF:
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
switch (BYTE_TYPE(enc, ptr)) {
case BT_S:
case BT_S: case BT_CR: case BT_LF:
break;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
@ -380,7 +380,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
case BT_S:
case BT_S: case BT_CR: case BT_LF:
for (;;) {
int t;
@ -390,7 +390,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
t = BYTE_TYPE(enc, ptr);
if (t == BT_EQUALS)
break;
if (t != BT_S) {
switch (t) {
case BT_S:
case BT_LF:
case BT_CR:
break;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
@ -407,7 +412,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
open = BYTE_TYPE(enc, ptr);
if (open == BT_QUOT || open == BT_APOS)
break;
if (open != BT_S) {
switch (open) {
case BT_S:
case BT_LF:
case BT_CR:
break;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
@ -449,7 +459,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
case BT_S:
case BT_S: case BT_CR: case BT_LF:
continue;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
@ -513,7 +523,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
case BT_S:
case BT_S: case BT_CR: case BT_LF:
{
ptr += MINBPC;
while (ptr != end) {
@ -523,7 +533,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
goto gt;
case BT_SOL:
goto sol;
case BT_S:
case BT_S: case BT_CR: case BT_LF:
ptr += MINBPC;
continue;
default:
@ -697,10 +707,19 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
case BT_S:
do {
case BT_S: case BT_CR: case BT_LF:
for (;;) {
ptr += MINBPC;
} while (ptr != end && BYTE_TYPE(enc, ptr) == BT_S);
if (ptr == end)
break;
switch (BYTE_TYPE(enc, ptr)) {
case BT_S: case BT_CR: case BT_LF:
break;
default:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_S;
}
}
*nextTokPtr = ptr;
return XML_TOK_PROLOG_S;
default:
@ -715,7 +734,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
case BT_NONXML:
case BT_MALFORM:
case BT_TRAIL:
case BT_S:
case BT_S: case BT_CR: case BT_LF:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
@ -774,7 +793,7 @@ int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
else if (open == BT_APOS)
state = other;
break;
case BT_S:
case BT_S: case BT_CR: case BT_LF:
/* This case ensures that the first attribute name is counted
Apart from that we could just change state on the quote. */
if (state == inName)
@ -842,6 +861,47 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
/* not reached */
}
static
void PREFIX(updatePosition)(const ENCODING *enc,
const char *ptr,
const char *end,
POSITION *pos)
{
if (pos->ignoreInitialLF) {
if (ptr == end)
return;
if (CHAR_MATCHES(enc, ptr, '\n'))
ptr += MINBPC;
pos->ignoreInitialLF = 0;
}
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
MULTIBYTE_CASES(ptr, end, ;/* hack! */)
case BT_LF:
pos->columnNumber = (unsigned)-1;
pos->lineNumber++;
ptr += MINBPC;
break;
case BT_CR:
pos->lineNumber++;
ptr += MINBPC;
if (ptr == end) {
pos->ignoreInitialLF = 1;
pos->columnNumber = 0;
return;
}
pos->columnNumber = (unsigned)-1;
if (CHAR_MATCHES(enc, ptr, '\n'))
ptr += MINBPC;
break;
default:
ptr += MINBPC;
break;
}
pos->columnNumber++;
}
}
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES

View file

@ -21,6 +21,8 @@ enum {
BT_LSQB,
BT_RSQB,
BT_S,
BT_CR,
BT_LF,
BT_NMSTRT,
BT_HEX,
BT_DIGIT,

View file

@ -1,4 +1,5 @@
#include <stdlib.h>
#include <string.h>
#include "wfcheck.h"
#ifdef _MSC_VER
@ -10,9 +11,14 @@
static
int skipProlog(const char **s, const char *end, const char **nextTokP,
const ENCODING **enc);
static
void setPosition(const ENCODING *enc,
const char *start, const char *end,
const char **badPtr, unsigned long *badLine, unsigned long *badCol);
enum WfCheckResult
wfCheck(const char *s, size_t n, const char **badPtr)
wfCheck(const char *s, size_t n,
const char **badPtr, unsigned long *badLine, unsigned long *badCol)
{
unsigned nElements = 0;
unsigned nAtts = 0;
@ -33,16 +39,16 @@ wfCheck(const char *s, size_t n, const char **badPtr)
for (;;) {
switch (tok) {
case XML_TOK_NONE:
*badPtr = s;
setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(noElements);
case XML_TOK_INVALID:
*badPtr = next;
setPosition(enc, start, next, badPtr, badLine, badCol);
RETURN_CLEANUP(invalidToken);
case XML_TOK_PARTIAL:
*badPtr = s;
setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(unclosedToken);
case XML_TOK_PARTIAL_CHAR:
*badPtr = s;
setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(partialChar);
case XML_TOK_EMPTY_ELEMENT_NO_ATTS:
nElements++;
@ -87,7 +93,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
int j;
for (j = 0; j < i; j++) {
if (XmlSameName(enc, atts[i], atts[j])) {
*badPtr = atts[i];
setPosition(enc, start, atts[i], badPtr, badLine, badCol);
RETURN_CLEANUP(duplicateAttribute);
}
}
@ -97,7 +103,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
case XML_TOK_END_TAG:
--level;
if (!XmlSameName(enc, startName[level], s + enc->minBytesPerChar * 2)) {
*badPtr = s;
setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(tagMismatch);
}
break;
@ -116,7 +122,7 @@ wfCheck(const char *s, size_t n, const char **badPtr)
break;
default:
if (tok > 0) {
*badPtr = s;
setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(junkAfterDocElement);
}
break;
@ -156,3 +162,16 @@ int skipProlog(const char **startp, const char *end,
}
/* not reached */
}
static
void setPosition(const ENCODING *enc,
const char *start, const char *end,
const char **badPtr, unsigned long *badLine, unsigned long *badCol)
{
POSITION pos;
memset(&pos, 0, sizeof(POSITION));
XmlUpdatePosition(enc, start, end, &pos);
*badPtr = end;
*badLine = pos.lineNumber;
*badCol = pos.columnNumber;
}

View file

@ -13,5 +13,8 @@ enum WfCheckResult {
junkAfterDocElement
};
enum WfCheckResult wfCheck(const char *s, size_t n, const char **badPtr);
enum WfCheckResult wfCheck(const char *s, size_t n,
const char **errorPtr,
unsigned long *errorLineNumber,
unsigned long *errorColNumber);

View file

@ -10,8 +10,11 @@ int doFile(const char *name)
HANDLE f;
HANDLE m;
DWORD size;
DWORD sizeHi;
const char *p;
const char *bad = 0;
const char *badPtr = 0;
unsigned long badLine = 0;
unsigned long badCol = 0;
int ret;
enum WfCheckResult result;
@ -21,7 +24,16 @@ int doFile(const char *name)
fprintf(stderr, "%s: CreateFile failed\n", name);
return 0;
}
size = GetFileSize(f, NULL);
size = GetFileSize(f, &sizeHi);
if (sizeHi) {
fprintf(stderr, "%s: too big (limit 2Gb)\n", name);
return 0;
}
/* CreateFileMapping barfs on zero length files */
if (size == 0) {
fprintf(stderr, "%s: zero-length file\n", name);
return 0;
}
m = CreateFileMapping(f, NULL, PAGE_READONLY, 0, 0, NULL);
if (m == NULL) {
fprintf(stderr, "%s: CreateFileMapping failed\n", name);
@ -35,21 +47,23 @@ int doFile(const char *name)
fprintf(stderr, "%s: MapViewOfFile failed\n", name);
return 0;
}
result = wfCheck(p, size, &bad);
result = wfCheck(p, size, &badPtr, &badLine, &badCol);
if (result) {
static const char *message[] = {
0,
"out of memory",
"no element found",
"invalid token after %lu bytes",
"unclosed token started after %lu bytes",
"unclosed token started after %lu bytes",
"mismatched tag after %lu bytes",
"duplicate attribute after %lu bytes",
"junk after document element after %lu bytes",
"invalid token",
"unclosed token",
"unclosed token",
"mismatched tag",
"duplicate attribute",
"junk after document element",
};
fprintf(stderr, "%s: ", name);
fprintf(stderr, message[result], (unsigned long)(bad - p));
fprintf(stderr, "%s:", name);
if (badPtr != 0)
fprintf(stderr, "%lu:%lu:", badLine+1, badCol);
fprintf(stderr, "E: %s", message[result]);
putc('\n', stderr);
ret = 1;
}