Add methods for getting attributes and comparing names.

This commit is contained in:
James Clark 1997-11-11 05:53:20 +00:00
parent 1197596d3b
commit 69f0244382
3 changed files with 158 additions and 22 deletions

View file

@ -1,5 +1,12 @@
/* TODO
method to get name length
method to extract attribute names (returns number of atts)
size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
Provide method to count lines/columns.
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
Better prolog tokenization
@ -9,6 +16,8 @@ NMTOKEN
NAME
PEREF
MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
*/
#ifdef _MSC_VER
@ -78,7 +87,7 @@ struct normal_encoding {
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
{ { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo)
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX
@ -152,7 +161,7 @@ const struct encoding little2_encoding = {
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX

View file

@ -14,28 +14,32 @@ extern "C" {
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
#define XML_TOK_PARTIAL -1 /* only part of a token */
#define XML_TOK_INVALID 0
#define XML_TOK_BOM 1 /* Byte order mark */
#define XML_TOK_COMMENT 2
#define XML_TOK_PI 3 /* processing instruction */
/* The following tokens are returned only by XmlPrologTok */
#define XML_TOK_LITERAL 4
#define XML_TOK_PROLOG_CHARS 5
#define XML_TOK_PROLOG_S 6
/* The following token is returned by XmlPrologTok when it detects the end
of the prolog and is also returned by XmlContentTok */
#define XML_TOK_START_TAG 7
#define XML_TOK_START_TAG_WITH_ATTS 1
#define XML_TOK_START_TAG_NO_ATTS 2
#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
/* The following tokens are returned only by XmlContentTok */
#define XML_TOK_END_TAG 8
#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
#define XML_TOK_DATA_CHARS 10
#define XML_TOK_CDATA_SECTION 11
#define XML_TOK_ENTITY_REF 12
#define XML_TOK_CHAR_REF 13 /* numeric character reference */
#define XML_TOK_END_TAG 5
#define XML_TOK_DATA_CHARS 6
#define XML_TOK_CDATA_SECTION 7
#define XML_TOK_ENTITY_REF 8
#define XML_TOK_CHAR_REF 9 /* numeric character reference */
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
#define XML_TOK_PI 10 /* processing instruction */
#define XML_TOK_COMMENT 11
#define XML_TOK_BOM 12 /* Byte order mark */
/* The following tokens are returned only by XmlPrologTok */
#define XML_TOK_LITERAL 13
#define XML_TOK_PROLOG_CHARS 14
#define XML_TOK_PROLOG_S 15
#define XML_NSTATES 2
#define XML_PROLOG_STATE 0
@ -46,6 +50,10 @@ typedef struct encoding {
const char *,
const char *,
const char **);
int (*sameName)(const struct encoding *,
const char *, const char *);
int (*getAtts)(const struct encoding *enc, const char *ptr,
int attsMax, const char **atts);
int minBytesPerChar;
} ENCODING;
@ -79,6 +87,11 @@ literals, comments and processing instructions.
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
(((enc)->getAtts)(enc, ptr, attsMax, atts))
typedef struct {
ENCODING initEnc;
const ENCODING **encPtr;

View file

@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
continue;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_START_TAG;
return XML_TOK_START_TAG_WITH_ATTS;
case BT_SOL:
ptr += MINBPC;
if (ptr == end)
@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_EMPTY_ELEMENT;
return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
case BT_GT:
gt:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_START_TAG;
return XML_TOK_START_TAG_NO_ATTS;
case BT_SOL:
sol:
ptr += MINBPC;
@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_EMPTY_ELEMENT;
return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PROLOG_CHARS;
}
/* Collects the attribute names of a start-tag or empty-element tag.

   ptr must point at the first character of the tag (scanning begins one
   character after it, inside the element name).  This must only be called
   for a well-formed start-tag or empty element tag: no end pointer is
   taken, and the scan stops only at a '>' or '/' found outside an
   attribute value.

   Returns the total number of attributes in the tag.  Pointers to the
   first character of the names of up to the first attsMax attributes are
   stored in atts; any further attributes are counted but not stored, so
   the return value may be greater than attsMax. */
static
int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
                    int attsMax, const char **atts)
{
  /* The scan starts inside the element name, so that name is never
     mistaken for an attribute name. */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  /* Byte type of the quote that opened the current value.  It is only
     consulted while state == inValue; the initializer merely keeps it
     defined. */
  int open = BT_QUOT;

  for (ptr += MINBPC;; ptr += MINBPC) {
    switch (BYTE_TYPE(enc, ptr)) {
/* A name-start character seen between tokens begins a new attribute. */
#define START_NAME \
      if (state == other) { \
        if (nAtts < attsMax) \
          atts[nAtts] = ptr; \
        ++nAtts; \
        state = inName; \
      }
/* Multibyte lead byte: skip the trailing bytes not covered by the loop step. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
#undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#undef START_NAME
    case BT_QUOT:
      /* A quote outside a value opens one.  The state may still be inName
         here (name="value" with no space before the quote), so test for
         "not in a value" rather than for state == other. */
      if (state != inValue) {
        state = inValue;
        open = BT_QUOT;
      }
      else if (open == BT_QUOT)
        state = other;
      break;
    case BT_APOS:
      if (state != inValue) {
        state = inValue;
        open = BT_APOS;
      }
      else if (open == BT_APOS)
        state = other;
      break;
    case BT_S:
      /* Whitespace ends the element name (or an attribute name), so that
         the next name-start character is counted as an attribute.
         Whitespace inside a value leaves the state untouched. */
      if (state == inName)
        state = other;
      break;
    case BT_GT:
    case BT_SOL:
      /* '>' and '/' end the tag unless they occur inside a value. */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
/* Compares two names stored in the encoding described by enc.

   ptr1 and ptr2 each point at the first character of a name.  Characters
   are compared until ptr1 reaches a character that cannot be part of a
   name.  Returns 1 if the second name also ends at that point (the names
   are the same) and 0 otherwise.  No end pointers are taken, so both
   names must be followed by a non-name character. */
static
int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
{
  int i;

  for (;;) {
    switch (BYTE_TYPE(enc, ptr1)) {
/* Each lead case compares one byte and falls into the next shorter one,
   so a sequence introduced by BT_LEADn has n bytes compared in total. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (*ptr1++ != *ptr2++) \
        return 0;
    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
#undef LEAD_CASE
      /* fall through */
      if (*ptr1++ != *ptr2++)
        return 0;
      break;
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
    case BT_DIGIT:
    case BT_NAME:
    case BT_MINUS:
      /* Compare every byte of the code unit, not just the first one:
         when MINBPC > 1 two different characters can share a first byte. */
      for (i = 0; i < MINBPC; i++) {
        if (ptr1[i] != ptr2[i])
          return 0;
      }
      ptr1 += MINBPC;
      ptr2 += MINBPC;
      break;
    default:
      /* The first name has ended.  With one-byte units an identical byte
         in the second name is the same non-name character, so it has ended
         too.  With wider units a matching first byte proves nothing, so
         always classify the second name's character. */
      if (MINBPC == 1 && *ptr1 == *ptr2)
        return 1;
      switch (BYTE_TYPE(enc, ptr2)) {
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
      case BT_LEAD5:
      case BT_LEAD6:
      case BT_NONASCII:
      case BT_NMSTRT:
      case BT_HEX:
      case BT_DIGIT:
      case BT_NAME:
      case BT_MINUS:
        /* The second name continues, so it is longer than the first. */
        return 0;
      default:
        return 1;
      }
    }
  }
  /* not reached */
}
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES