mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-05 05:05:00 +00:00
Add methods for getting attributes and comparing names.
This commit is contained in:
parent
1197596d3b
commit
69f0244382
3 changed files with 158 additions and 22 deletions
|
@ -1,5 +1,12 @@
|
|||
/* TODO
|
||||
|
||||
method to get name length
|
||||
method to extract attribute names (returns number of atts)
|
||||
size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
|
||||
|
||||
|
||||
Provide method to count lines/columns.
|
||||
|
||||
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
|
||||
|
||||
Better prolog tokenization
|
||||
|
@ -9,6 +16,8 @@ NMTOKEN
|
|||
NAME
|
||||
PEREF
|
||||
|
||||
MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
|
||||
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
@ -78,7 +87,7 @@ struct normal_encoding {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct normal_encoding utf8_encoding = {
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
|
||||
{ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
};
|
||||
|
@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo)
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding little2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
@ -152,7 +161,7 @@ const struct encoding little2_encoding = {
|
|||
#undef IS_NMSTRT_CHAR
|
||||
|
||||
const struct encoding big2_encoding = {
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, 2
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
|
|
@ -14,28 +14,32 @@ extern "C" {
|
|||
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
|
||||
#define XML_TOK_PARTIAL -1 /* only part of a token */
|
||||
#define XML_TOK_INVALID 0
|
||||
#define XML_TOK_BOM 1 /* Byte order mark */
|
||||
#define XML_TOK_COMMENT 2
|
||||
#define XML_TOK_PI 3 /* processing instruction */
|
||||
|
||||
/* The following tokens are returned only by XmlPrologTok */
|
||||
#define XML_TOK_LITERAL 4
|
||||
#define XML_TOK_PROLOG_CHARS 5
|
||||
#define XML_TOK_PROLOG_S 6
|
||||
|
||||
/* The following token is returned by XmlPrologTok when it detects the end
|
||||
of the prolog and is also returned by XmlContentTok */
|
||||
|
||||
#define XML_TOK_START_TAG 7
|
||||
#define XML_TOK_START_TAG_WITH_ATTS 1
|
||||
#define XML_TOK_START_TAG_NO_ATTS 2
|
||||
#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
|
||||
#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
|
||||
|
||||
/* The following tokens are returned only by XmlContentTok */
|
||||
|
||||
#define XML_TOK_END_TAG 8
|
||||
#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
|
||||
#define XML_TOK_DATA_CHARS 10
|
||||
#define XML_TOK_CDATA_SECTION 11
|
||||
#define XML_TOK_ENTITY_REF 12
|
||||
#define XML_TOK_CHAR_REF 13 /* numeric character reference */
|
||||
#define XML_TOK_END_TAG 5
|
||||
#define XML_TOK_DATA_CHARS 6
|
||||
#define XML_TOK_CDATA_SECTION 7
|
||||
#define XML_TOK_ENTITY_REF 8
|
||||
#define XML_TOK_CHAR_REF 9 /* numeric character reference */
|
||||
|
||||
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
|
||||
#define XML_TOK_PI 10 /* processing instruction */
|
||||
#define XML_TOK_COMMENT 11
|
||||
#define XML_TOK_BOM 12 /* Byte order mark */
|
||||
|
||||
/* The following tokens are returned only by XmlPrologTok */
|
||||
#define XML_TOK_LITERAL 13
|
||||
#define XML_TOK_PROLOG_CHARS 14
|
||||
#define XML_TOK_PROLOG_S 15
|
||||
|
||||
#define XML_NSTATES 2
|
||||
#define XML_PROLOG_STATE 0
|
||||
|
@ -46,6 +50,10 @@ typedef struct encoding {
|
|||
const char *,
|
||||
const char *,
|
||||
const char **);
|
||||
int (*sameName)(const struct encoding *,
|
||||
const char *, const char *);
|
||||
int (*getAtts)(const struct encoding *enc, const char *ptr,
|
||||
int attsMax, const char **atts);
|
||||
int minBytesPerChar;
|
||||
} ENCODING;
|
||||
|
||||
|
@ -79,6 +87,11 @@ literals, comments and processing instructions.
|
|||
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
/* Compare the names starting at ptr1 and ptr2 by dispatching to the
   encoding's sameName function; the result is non-zero when the names
   match.  Note that enc is evaluated twice. */
#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
|
||||
|
||||
/* Dispatch to the encoding's getAtts function: yields the number of
   attributes in the tag at ptr and stores pointers to the names of up to
   attsMax of them in atts.  Note that enc is evaluated twice. */
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
|
||||
(((enc)->getAtts)(enc, ptr, attsMax, atts))
|
||||
|
||||
typedef struct {
|
||||
ENCODING initEnc;
|
||||
const ENCODING **encPtr;
|
||||
|
|
|
@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
continue;
|
||||
case BT_GT:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_START_TAG;
|
||||
return XML_TOK_START_TAG_WITH_ATTS;
|
||||
case BT_SOL:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
|
@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_EMPTY_ELEMENT;
|
||||
return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
|
@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
case BT_GT:
|
||||
gt:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_START_TAG;
|
||||
return XML_TOK_START_TAG_NO_ATTS;
|
||||
case BT_SOL:
|
||||
sol:
|
||||
ptr += MINBPC;
|
||||
|
@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_INVALID;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_EMPTY_ELEMENT;
|
||||
return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
|
@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_PROLOG_CHARS;
|
||||
}
|
||||
|
||||
/* This must only be called for a well-formed start-tag or empty element tag.
|
||||
Returns the number of attributes. Pointers to the names of up to the first
|
||||
attsMax attributes are stored in atts. */
|
||||
static
|
||||
int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
|
||||
int attsMax, const char **atts)
|
||||
{
|
||||
enum { other, inName, inValue } state = inName;
|
||||
int nAtts = 0;
|
||||
int open;
|
||||
|
||||
for (ptr += MINBPC;; ptr += MINBPC) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
#define START_NAME \
|
||||
if (state == other) { \
|
||||
if (nAtts < attsMax) \
|
||||
atts[nAtts] = ptr; \
|
||||
++nAtts; \
|
||||
state = inName; \
|
||||
}
|
||||
#define LEAD_CASE(n) \
|
||||
case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
|
||||
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
|
||||
#undef LEAD_CASE
|
||||
case BT_NONASCII:
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
START_NAME
|
||||
break;
|
||||
#undef START_NAME
|
||||
case BT_QUOT:
|
||||
if (state == other) {
|
||||
state = inValue;
|
||||
open = BT_QUOT;
|
||||
}
|
||||
else if (open == BT_QUOT)
|
||||
state = other;
|
||||
break;
|
||||
case BT_APOS:
|
||||
if (state == other) {
|
||||
state = inValue;
|
||||
open = BT_APOS;
|
||||
}
|
||||
else if (open == BT_APOS)
|
||||
state = other;
|
||||
break;
|
||||
case BT_S:
|
||||
/* This case ensures that the first attribute name is counted
|
||||
Apart from that we could just change state on the quote. */
|
||||
if (state == inName)
|
||||
state = other;
|
||||
break;
|
||||
case BT_GT:
|
||||
case BT_SOL:
|
||||
if (state != inValue)
|
||||
return nAtts;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
|
||||
{
|
||||
for (;;) {
|
||||
switch (BYTE_TYPE(enc, ptr1)) {
|
||||
#define LEAD_CASE(n) \
|
||||
case BT_LEAD ## n: \
|
||||
if (*ptr1++ != *ptr2++) \
|
||||
return 0;
|
||||
LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
|
||||
#undef LEAD_CASE
|
||||
/* fall through */
|
||||
if (*ptr1++ != *ptr2++)
|
||||
return 0;
|
||||
break;
|
||||
case BT_NONASCII:
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
case BT_DIGIT:
|
||||
case BT_NAME:
|
||||
case BT_MINUS:
|
||||
if (*ptr2 != *ptr1)
|
||||
return 0;
|
||||
ptr1 += MINBPC;
|
||||
ptr2 += MINBPC;
|
||||
break;
|
||||
default:
|
||||
if (*ptr1 == *ptr2)
|
||||
return 1;
|
||||
switch (BYTE_TYPE(enc, ptr2)) {
|
||||
case BT_LEAD2:
|
||||
case BT_LEAD3:
|
||||
case BT_LEAD4:
|
||||
case BT_LEAD5:
|
||||
case BT_LEAD6:
|
||||
case BT_NONASCII:
|
||||
case BT_NMSTRT:
|
||||
case BT_HEX:
|
||||
case BT_DIGIT:
|
||||
case BT_NAME:
|
||||
case BT_MINUS:
|
||||
return 0;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
#undef DO_LEAD_CASE
|
||||
#undef MULTIBYTE_CASES
|
||||
#undef INVALID_CASES
|
||||
|
|
Loading…
Add table
Reference in a new issue