From 69f024438207f34c1f901bd525e1fb25b7cbf19a Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 11 Nov 1997 05:53:20 +0000 Subject: [PATCH] Add methods for getting attributes and comparing names. --- expat/xmltok/xmltok.c | 15 ++++- expat/xmltok/xmltok.h | 43 ++++++++----- expat/xmltok/xmltok_impl.c | 122 +++++++++++++++++++++++++++++++++++-- 3 files changed, 158 insertions(+), 22 deletions(-) diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index f1a0d351..b0693c85 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -1,5 +1,12 @@ /* TODO +method to get name length +method to extract attribute names (returns number of atts) +size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts) + + +Provide method to count lines/columns. + Provide methods to convert to any of UTF-8, UTF-18, UCS-4. Better prolog tokenization @@ -9,6 +16,8 @@ NMTOKEN NAME PEREF +MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd) + */ #ifdef _MSC_VER @@ -78,7 +87,7 @@ struct normal_encoding { #undef IS_NMSTRT_CHAR const struct normal_encoding utf8_encoding = { - { { PREFIX(prologTok), PREFIX(contentTok) }, 1 }, + { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 }, #include "asciitab.h" #include "utf8tab.h" }; @@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo) #undef IS_NMSTRT_CHAR const struct encoding little2_encoding = { - { PREFIX(prologTok), PREFIX(contentTok) }, 2 + { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2 }; #undef PREFIX @@ -152,7 +161,7 @@ const struct encoding little2_encoding = { #undef IS_NMSTRT_CHAR const struct encoding big2_encoding = { - { PREFIX(prologTok), PREFIX(contentTok) }, 2 + { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2 }; #undef PREFIX diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h index 511251e7..3583e068 100755 --- a/expat/xmltok/xmltok.h +++ b/expat/xmltok/xmltok.h @@ 
-14,28 +14,32 @@ extern "C" { #define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */ #define XML_TOK_PARTIAL -1 /* only part of a token */ #define XML_TOK_INVALID 0 -#define XML_TOK_BOM 1 /* Byte order mark */ -#define XML_TOK_COMMENT 2 -#define XML_TOK_PI 3 /* processing instruction */ - -/* The following tokens are returned only by XmlPrologTok */ -#define XML_TOK_LITERAL 4 -#define XML_TOK_PROLOG_CHARS 5 -#define XML_TOK_PROLOG_S 6 /* The following token is returned by XmlPrologTok when it detects the end of the prolog and is also returned by XmlContentTok */ -#define XML_TOK_START_TAG 7 +#define XML_TOK_START_TAG_WITH_ATTS 1 +#define XML_TOK_START_TAG_NO_ATTS 2 +#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag */ +#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4 /* The following tokens are returned only by XmlContentTok */ -#define XML_TOK_END_TAG 8 -#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag */ -#define XML_TOK_DATA_CHARS 10 -#define XML_TOK_CDATA_SECTION 11 -#define XML_TOK_ENTITY_REF 12 -#define XML_TOK_CHAR_REF 13 /* numeric character reference */ +#define XML_TOK_END_TAG 5 +#define XML_TOK_DATA_CHARS 6 +#define XML_TOK_CDATA_SECTION 7 +#define XML_TOK_ENTITY_REF 8 +#define XML_TOK_CHAR_REF 9 /* numeric character reference */ + +/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */ +#define XML_TOK_PI 10 /* processing instruction */ +#define XML_TOK_COMMENT 11 +#define XML_TOK_BOM 12 /* Byte order mark */ + +/* The following tokens are returned only by XmlPrologTok */ +#define XML_TOK_LITERAL 13 +#define XML_TOK_PROLOG_CHARS 14 +#define XML_TOK_PROLOG_S 15 #define XML_NSTATES 2 #define XML_PROLOG_STATE 0 @@ -46,6 +50,10 @@ typedef struct encoding { const char *, const char *, const char **); + int (*sameName)(const struct encoding *, + const char *, const char *); + int (*getAtts)(const struct encoding *enc, const char *ptr, + int attsMax, const char **atts); int minBytesPerChar; } ENCODING; @@ 
-79,6 +87,11 @@ literals, comments and processing instructions. #define XmlContentTok(enc, ptr, end, nextTokPtr) \ XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr) +#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2)) + +#define XmlGetAttributes(enc, ptr, attsMax, atts) \ + (((enc)->getAtts)(enc, ptr, attsMax, atts)) + typedef struct { ENCODING initEnc; const ENCODING **encPtr; diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c index efb0f380..a71d76c4 100755 --- a/expat/xmltok/xmltok_impl.c +++ b/expat/xmltok/xmltok_impl.c @@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, continue; case BT_GT: *nextTokPtr = ptr + MINBPC; - return XML_TOK_START_TAG; + return XML_TOK_START_TAG_WITH_ATTS; case BT_SOL: ptr += MINBPC; if (ptr == end) @@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_INVALID; } *nextTokPtr = ptr + MINBPC; - return XML_TOK_EMPTY_ELEMENT; + return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; default: *nextTokPtr = ptr; return XML_TOK_INVALID; @@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, case BT_GT: gt: *nextTokPtr = ptr + MINBPC; - return XML_TOK_START_TAG; + return XML_TOK_START_TAG_NO_ATTS; case BT_SOL: sol: ptr += MINBPC; @@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_INVALID; } *nextTokPtr = ptr + MINBPC; - return XML_TOK_EMPTY_ELEMENT; + return XML_TOK_EMPTY_ELEMENT_NO_ATTS; default: *nextTokPtr = ptr; return XML_TOK_INVALID; @@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_PROLOG_CHARS; } +/* This must only be called for a well-formed start-tag or empty element tag. +Returns the number of attributes. Pointers to the names of up to the first +attsMax attributes are stored in atts. 
*/
+static
+int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
+                    int attsMax, const char **atts)
+{
+  /* other:   between tokens inside the tag
+     inName:  inside the element name or an attribute name
+     inValue: inside a quoted attribute value */
+  enum { other, inName, inValue } state = inName;
+  int nAtts = 0;
+  /* Byte type of the quote that opened the current attribute value.
+     Only meaningful while state == inValue. */
+  int open = 0;
+
+  /* The first character of the tag is skipped, so scanning begins
+     inside the element name; hence the initial inName state. */
+  for (ptr += MINBPC;; ptr += MINBPC) {
+    switch (BYTE_TYPE(enc, ptr)) {
+#define START_NAME \
+      if (state == other) { \
+        if (nAtts < attsMax) \
+          atts[nAtts] = ptr; \
+        ++nAtts; \
+        state = inName; \
+      }
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
+#undef LEAD_CASE
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+      START_NAME
+      break;
+#undef START_NAME
+    case BT_QUOT:
+      /* A quote seen outside a value always opens one.  The state is
+         usually still inName here (name="value" with no space before
+         the '='), so testing for state == other would miss it. */
+      if (state != inValue) {
+        state = inValue;
+        open = BT_QUOT;
+      }
+      else if (open == BT_QUOT)
+        state = other;
+      break;
+    case BT_APOS:
+      if (state != inValue) {
+        state = inValue;
+        open = BT_APOS;
+      }
+      else if (open == BT_APOS)
+        state = other;
+      break;
+    case BT_S:
+      /* Whitespace ends the element name or an attribute name, so that
+         the next name start character is counted as a new attribute.
+         Whitespace inside a value leaves the state untouched. */
+      if (state == inName)
+        state = other;
+      break;
+    case BT_GT:
+    case BT_SOL:
+      /* '>' or '/' ends the tag unless it occurs inside a value. */
+      if (state != inValue)
+        return nAtts;
+      break;
+    default:
+      break;
+    }
+  }
+  /* not reached */
+}
+
+/* Compares the names starting at ptr1 and ptr2.  Each name extends up to
+the first character that is not a name character.  Returns 1 if the two
+names are identical and 0 otherwise. */
+static
+int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
+{
+  int i;
+
+  for (;;) {
+    switch (BYTE_TYPE(enc, ptr1)) {
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (*ptr1++ != *ptr2++) \
+        return 0;
+    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
+#undef LEAD_CASE
+      /* fall through */
+      if (*ptr1++ != *ptr2++)
+        return 0;
+      break;
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+    case BT_DIGIT:
+    case BT_NAME:
+    case BT_MINUS:
+      /* Compare every byte of the character, not just the first one;
+         with MINBPC > 1 distinct characters can share a first byte. */
+      for (i = 0; i < MINBPC; i++)
+        if (ptr1[i] != ptr2[i])
+          return 0;
+      ptr1 += MINBPC;
+      ptr2 += MINBPC;
+      break;
+    default:
+      /* The first name has ended; the names are the same only if the
+         second name ends here too. */
+      switch (BYTE_TYPE(enc, ptr2)) {
+      case BT_LEAD2:
+      case BT_LEAD3:
+      case BT_LEAD4:
+      case BT_LEAD5:
+      case BT_LEAD6:
+      case BT_NONASCII:
+      case BT_NMSTRT:
+      case BT_HEX:
+      case BT_DIGIT:
+      case BT_NAME:
+      case BT_MINUS:
+        return 0;
+      default:
+        return 1;
+      }
+    }
+  }
+  /* not reached */
+}
+
 #undef DO_LEAD_CASE
 #undef MULTIBYTE_CASES
 #undef INVALID_CASES