Add methods for getting attributes and comparing names.

2025-04-05 05:05:00 +00:00 · 1997-11-11 05:53:20 +00:00 · 1997-11-11 05:53:20 +00:00 · 69f0244382
commit 69f0244382
parent 1197596d3b
3 changed files with 158 additions and 22 deletions
--- a/expat/xmltok/xmltok.c
+++ b/expat/xmltok/xmltok.c
@ -1,5 +1,12 @@
 /* TODO

+method to get name length
+method to extract attribute names (returns number of atts)
+size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
+
+
+Provide method to count lines/columns.
+
 Provide methods to convert to any of UTF-8, UTF-16, UCS-4.

 Better prolog tokenization
@ -9,6 +16,8 @@ NMTOKEN
 NAME
 PEREF

+MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
+
 */

 #ifdef _MSC_VER
@ -78,7 +87,7 @@ struct normal_encoding {
 #undef IS_NMSTRT_CHAR

 const struct normal_encoding utf8_encoding = {
-  { { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
+  { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
 #include "asciitab.h"
 #include "utf8tab.h"
 };
@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo)
 #undef IS_NMSTRT_CHAR

 const struct encoding little2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
 };

 #undef PREFIX
@ -152,7 +161,7 @@ const struct encoding little2_encoding = {
 #undef IS_NMSTRT_CHAR

 const struct encoding big2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
 };

 #undef PREFIX
--- a/expat/xmltok/xmltok.h
+++ b/expat/xmltok/xmltok.h
@ -14,28 +14,32 @@ extern "C" {
 #define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
 #define XML_TOK_PARTIAL -1 /* only part of a token */
 #define XML_TOK_INVALID 0
-#define XML_TOK_BOM 1     /* Byte order mark */
-#define XML_TOK_COMMENT 2
-#define XML_TOK_PI 3      /* processing instruction */
-
-/* The following tokens are returned only by XmlPrologTok */
-#define XML_TOK_LITERAL 4
-#define XML_TOK_PROLOG_CHARS 5
-#define XML_TOK_PROLOG_S 6

 /* The following token is returned by XmlPrologTok when it detects the end
 of the prolog and is also returned by XmlContentTok */

-#define XML_TOK_START_TAG 7
+#define XML_TOK_START_TAG_WITH_ATTS 1
+#define XML_TOK_START_TAG_NO_ATTS 2
+#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
+#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4

 /* The following tokens are returned only by XmlContentTok */

-#define XML_TOK_END_TAG 8
-#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
-#define XML_TOK_DATA_CHARS 10
-#define XML_TOK_CDATA_SECTION 11
-#define XML_TOK_ENTITY_REF 12
-#define XML_TOK_CHAR_REF 13     /* numeric character reference */
+#define XML_TOK_END_TAG 5
+#define XML_TOK_DATA_CHARS 6
+#define XML_TOK_CDATA_SECTION 7
+#define XML_TOK_ENTITY_REF 8
+#define XML_TOK_CHAR_REF 9     /* numeric character reference */
+
+/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
+#define XML_TOK_PI 10      /* processing instruction */
+#define XML_TOK_COMMENT 11
+#define XML_TOK_BOM 12     /* Byte order mark */
+
+/* The following tokens are returned only by XmlPrologTok */
+#define XML_TOK_LITERAL 13
+#define XML_TOK_PROLOG_CHARS 14
+#define XML_TOK_PROLOG_S 15

 #define XML_NSTATES 2
 #define XML_PROLOG_STATE 0
@ -46,6 +50,10 @@ typedef struct encoding {
 			       const char *,
 			       const char *,
 			       const char **);
+  int (*sameName)(const struct encoding *,
+	          const char *, const char *);
+  int (*getAtts)(const struct encoding *enc, const char *ptr,
+	         int attsMax, const char **atts);
  int minBytesPerChar;
 } ENCODING;

@ -79,6 +87,11 @@ literals, comments and processing instructions.
 #define XmlContentTok(enc, ptr, end, nextTokPtr) \
   XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)

+#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
+
+#define XmlGetAttributes(enc, ptr, attsMax, atts) \
+  (((enc)->getAtts)(enc, ptr, attsMax, atts))
+
 typedef struct {
  ENCODING initEnc;
  const ENCODING **encPtr;
--- a/expat/xmltok/xmltok_impl.c
+++ b/expat/xmltok/xmltok_impl.c
@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
 	    continue;
 	  case BT_GT:
 	    *nextTokPtr = ptr + MINBPC;
-	    return XML_TOK_START_TAG;
+	    return XML_TOK_START_TAG_WITH_ATTS;
 	  case BT_SOL:
 	    ptr += MINBPC;
 	    if (ptr == end)
@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
 	      return XML_TOK_INVALID;
 	    }
 	    *nextTokPtr = ptr + MINBPC;
-	    return XML_TOK_EMPTY_ELEMENT;
+	    return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
 	  default:
 	    *nextTokPtr = ptr;
 	    return XML_TOK_INVALID;
@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC;
-      return XML_TOK_START_TAG;
+      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      ptr += MINBPC;
@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
 	return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC;
-      return XML_TOK_EMPTY_ELEMENT;
+      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  return XML_TOK_PROLOG_CHARS;
 }

+/* Count the attributes of a start-tag or empty-element tag and record
+pointers to their names.
+
+This must only be called for a well-formed start-tag or empty element tag:
+no end pointer is passed, so the scan stops only at the '>' or '/' that
+closes the tag.  The first character at ptr is skipped unconditionally
+(presumably the '<' that opens the tag -- confirm against callers).
+
+Returns the total number of attributes.  Pointers to the names of up to the
+first attsMax attributes are stored in atts; any further attributes are
+counted but not stored. */
+static
+int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
+		    int attsMax, const char **atts)
+{
+  /* Start in inName: the first name encountered is the element name and
+     must not be counted as an attribute. */
+  enum { other, inName, inValue } state = inName;
+  int nAtts = 0;
+  /* Byte type of the quote that opened the current attribute value.
+     Only meaningful while state == inValue. */
+  int open = 0;
+
+  for (ptr += MINBPC;; ptr += MINBPC) {
+    switch (BYTE_TYPE(enc, ptr)) {
+/* A name-start character seen outside a name or value begins a new
+   attribute name: record it if there is room and count it regardless. */
+#define START_NAME \
+      if (state == other) { \
+	if (nAtts < attsMax) \
+	  atts[nAtts] = ptr; \
+	++nAtts; \
+	state = inName; \
+      }
+/* Multibyte lead: skip the trail bytes not covered by the loop increment. */
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
+#undef LEAD_CASE
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+      START_NAME
+      break;
+#undef START_NAME
+    case BT_QUOT:
+      /* Any quote seen outside a value opens one.  The state is still
+         inName here when '=' follows the attribute name directly (a="v"),
+         so testing for state == other would miss the opening quote. */
+      if (state != inValue) {
+        state = inValue;
+        open = BT_QUOT;
+      }
+      else if (open == BT_QUOT)
+        state = other;
+      break;
+    case BT_APOS:
+      if (state != inValue) {
+        state = inValue;
+        open = BT_APOS;
+      }
+      else if (open == BT_APOS)
+        state = other;
+      break;
+    case BT_S:
+      /* Whitespace ends the element name (or an attribute name), which
+         ensures that the first attribute name is counted. */
+      if (state == inName)
+        state = other;
+      break;
+    case BT_GT:
+    case BT_SOL:
+      /* '>' or '/' closes the tag unless it is part of a quoted value. */
+      if (state != inValue)
+	return nAtts;
+      break;
+    default:
+      break;
+    }
+  }
+  /* not reached */
+}
+
+
+/* Compare the XML names that start at ptr1 and ptr2.
+
+No end pointers are passed: each name ends at the first character whose
+byte type cannot occur in a name, so both pointers must point into
+well-formed markup.  Returns 1 if the two names are identical and 0
+otherwise. */
+static
+int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
+{
+  for (;;) {
+    switch (BYTE_TYPE(enc, ptr1)) {
+/* Multibyte sequences: the cases fall through into one another so that a
+   lead byte of type BT_LEADn results in n byte comparisons in total. */
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (*ptr1++ != *ptr2++) \
+	return 0;
+    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
+#undef LEAD_CASE
+      /* fall through */
+      if (*ptr1++ != *ptr2++)
+	return 0;
+      break;
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+    case BT_DIGIT:
+    case BT_NAME:
+    case BT_MINUS:
+      /* Compare every byte of the code unit, not just the first one;
+         otherwise encodings with MINBPC > 1 would treat characters that
+         share a first byte as equal. */
+      {
+	int i;
+	for (i = 0; i < MINBPC; i++) {
+	  if (ptr1[i] != ptr2[i])
+	    return 0;
+	}
+      }
+      ptr1 += MINBPC;
+      ptr2 += MINBPC;
+      break;
+    default:
+      /* ptr1 has reached the end of its name.  With one-byte code units an
+         identical byte at ptr2 means the second name has ended too.  With
+         wider units the first byte alone does not identify the character,
+         so ptr2 must be classified explicitly. */
+      if (MINBPC == 1 && *ptr1 == *ptr2)
+	return 1;
+      switch (BYTE_TYPE(enc, ptr2)) {
+      case BT_LEAD2:
+      case BT_LEAD3:
+      case BT_LEAD4:
+      case BT_LEAD5:
+      case BT_LEAD6:
+      case BT_NONASCII:
+      case BT_NMSTRT:
+      case BT_HEX:
+      case BT_DIGIT:
+      case BT_NAME:
+      case BT_MINUS:
+	/* The second name continues, so it is longer than the first. */
+	return 0;
+      default:
+	return 1;
+      }
+    }
+  }
+  /* not reached */
+}
+
+
 #undef DO_LEAD_CASE
 #undef MULTIBYTE_CASES
 #undef INVALID_CASES