diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c
index f1a0d351..b0693c85 100755
--- a/expat/xmltok/xmltok.c
+++ b/expat/xmltok/xmltok.c
@@ -1,5 +1,12 @@
/* TODO
+method to get name length
+method to extract attribute names (returns number of atts)
+size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
+
+
+Provide method to count lines/columns.
+
Provide methods to convert to any of UTF-8, UTF-18, UCS-4.
Better prolog tokenization
@@ -9,6 +16,8 @@ NMTOKEN
NAME
PEREF
+MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
+
*/
#ifdef _MSC_VER
@@ -78,7 +87,7 @@ struct normal_encoding {
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
- { { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
+ { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
@@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo)
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX
@@ -152,7 +161,7 @@ const struct encoding little2_encoding = {
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX
diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h
index 511251e7..3583e068 100755
--- a/expat/xmltok/xmltok.h
+++ b/expat/xmltok/xmltok.h
@@ -14,28 +14,32 @@ extern "C" {
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
#define XML_TOK_PARTIAL -1 /* only part of a token */
#define XML_TOK_INVALID 0
-#define XML_TOK_BOM 1 /* Byte order mark */
-#define XML_TOK_COMMENT 2
-#define XML_TOK_PI 3 /* processing instruction */
-
-/* The following tokens are returned only by XmlPrologTok */
-#define XML_TOK_LITERAL 4
-#define XML_TOK_PROLOG_CHARS 5
-#define XML_TOK_PROLOG_S 6
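-/* The following token is returned by XmlPrologTok when it detects the end
-of the prolog and is also returned by XmlContentTok */
+/* The following tokens are returned by XmlPrologTok when it detects the end
+of the prolog and are also returned by XmlContentTok */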
-#define XML_TOK_START_TAG 7
+#define XML_TOK_START_TAG_WITH_ATTS 1
+#define XML_TOK_START_TAG_NO_ATTS 2
+#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag */
+#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
/* The following tokens are returned only by XmlContentTok */
-#define XML_TOK_END_TAG 8
-#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag */
-#define XML_TOK_DATA_CHARS 10
-#define XML_TOK_CDATA_SECTION 11
-#define XML_TOK_ENTITY_REF 12
-#define XML_TOK_CHAR_REF 13 /* numeric character reference */
+#define XML_TOK_END_TAG 5
+#define XML_TOK_DATA_CHARS 6
+#define XML_TOK_CDATA_SECTION 7
+#define XML_TOK_ENTITY_REF 8
+#define XML_TOK_CHAR_REF 9 /* numeric character reference */
+
+/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
+#define XML_TOK_PI 10 /* processing instruction */
+#define XML_TOK_COMMENT 11
+#define XML_TOK_BOM 12 /* Byte order mark */
+
+/* The following tokens are returned only by XmlPrologTok */
+#define XML_TOK_LITERAL 13
+#define XML_TOK_PROLOG_CHARS 14
+#define XML_TOK_PROLOG_S 15
#define XML_NSTATES 2
#define XML_PROLOG_STATE 0
@@ -46,6 +50,10 @@ typedef struct encoding {
const char *,
const char *,
const char **);
+ int (*sameName)(const struct encoding *,
+ const char *, const char *);
+ int (*getAtts)(const struct encoding *enc, const char *ptr,
+ int attsMax, const char **atts);
int minBytesPerChar;
} ENCODING;
@@ -79,6 +87,11 @@ literals, comments and processing instructions.
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
+#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
+/* XmlSameName/XmlGetAttributes call enc's sameName/getAtts function pointers. */
+#define XmlGetAttributes(enc, ptr, attsMax, atts) \
+ (((enc)->getAtts)(enc, ptr, attsMax, atts))
+
typedef struct {
ENCODING initEnc;
const ENCODING **encPtr;
diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c
index efb0f380..a71d76c4 100755
--- a/expat/xmltok/xmltok_impl.c
+++ b/expat/xmltok/xmltok_impl.c
@@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
continue;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_START_TAG;
+ return XML_TOK_START_TAG_WITH_ATTS;
case BT_SOL:
ptr += MINBPC;
if (ptr == end)
@@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_EMPTY_ELEMENT;
+ return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
@@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
case BT_GT:
gt:
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_START_TAG;
+ return XML_TOK_START_TAG_NO_ATTS;
case BT_SOL:
sol:
ptr += MINBPC;
@@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_EMPTY_ELEMENT;
+ return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
@@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PROLOG_CHARS;
}
+/* Finds the attribute names in a start-tag or empty element tag.
+This must only be called for a well-formed tag: there is no end pointer, and
+the scan stops only at a '>' or '/' that is outside an attribute value.
+The first character at ptr is skipped; scanning starts in the element name.
+Returns the number of attributes.  Pointers to the names of up to the first
+attsMax attributes are stored in atts. */
+static
+int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
+                    int attsMax, const char **atts)
+{
+  enum { other, inName, inValue } state = inName;
+  int nAtts = 0;
+  int open = 0; /* byte type of the quote that opened the current value */
+
+  for (ptr += MINBPC;; ptr += MINBPC) {
+    switch (BYTE_TYPE(enc, ptr)) {
+#define START_NAME \
+      if (state == other) { \
+        if (nAtts < attsMax) \
+          atts[nAtts] = ptr; \
+        ++nAtts; \
+        state = inName; \
+      }
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
+#undef LEAD_CASE
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+      START_NAME
+      break;
+#undef START_NAME
+    case BT_QUOT:
+    case BT_APOS:
+      /* Any quote outside a value opens one.  Right after name= the state is
+         still inName, so test for != inValue rather than == other.  Inside
+         a value only a quote of the same kind closes it. */
+      if (state != inValue) {
+        state = inValue;
+        open = BYTE_TYPE(enc, ptr);
+      }
+      else if (open == BYTE_TYPE(enc, ptr))
+        state = other;
+      break;
+    case BT_S:
+      /* White space ends a name, so the next name character starts an
+         attribute name; without this the first attribute is not counted. */
+      if (state == inName)
+        state = other;
+      break;
+    case BT_GT:
+    case BT_SOL:
+      /* '>' and '/' end the scan unless they are inside a value. */
+      if (state != inValue)
+        return nAtts;
+      break;
+    default:
+      break;
+    }
+  }
+  /* not reached */
+}
+
+/* Compares the names that start at ptr1 and ptr2.  Returns 1 if they are
+made of the same bytes and end at the same place, otherwise 0.  A name ends
+at the first character whose byte type is not a name type; there is no end
+pointer, so each name must be followed by such a character. */
+static
+int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
+{
+  int i;
+
+  for (;;) {
+    switch (BYTE_TYPE(enc, ptr1)) {
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (*ptr1++ != *ptr2++) \
+        return 0;
+    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
+#undef LEAD_CASE
+      /* fall through: each lead case checks one byte, this checks the last */
+      if (*ptr1++ != *ptr2++)
+        return 0;
+      break;
+    case BT_NONASCII: case BT_NMSTRT: case BT_HEX:
+    case BT_DIGIT: case BT_NAME: case BT_MINUS:
+      /* Compare every byte of the character; checking only the first byte
+         would treat distinct characters of a 2-byte encoding as equal. */
+      for (i = 0; i < MINBPC; i++) {
+        if (ptr1[i] != ptr2[i])
+          return 0;
+      }
+      ptr1 += MINBPC;
+      ptr2 += MINBPC;
+      break;
+    default:
+      /* ptr1 has reached the end of its name.  One identical byte shows
+         that ptr2 ended too only when characters are a single byte wide. */
+      if (MINBPC == 1 && *ptr1 == *ptr2)
+        return 1;
+      switch (BYTE_TYPE(enc, ptr2)) {
+      case BT_LEAD2: case BT_LEAD3: case BT_LEAD4: case BT_LEAD5:
+      case BT_LEAD6: case BT_NONASCII: case BT_NMSTRT: case BT_HEX:
+      case BT_DIGIT: case BT_NAME: case BT_MINUS:
+        return 0; /* the name at ptr2 continues, so it is longer */
+      default:
+        return 1;
+      }
+    }
+  }
+  /* not reached */
+}
+
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES