xmltok: Add more in-code documentation about byte types

This commit is contained in:
Sebastian Pipping 2019-08-27 18:46:00 +02:00
parent e3d6578214
commit 748ac8799d
2 changed files with 39 additions and 37 deletions

View file

@ -587,11 +587,13 @@ static const struct normal_encoding ascii_encoding
static int PTRFASTCALL
unicode_byte_type(char hi, char lo) {
switch ((unsigned char)hi) {
/* 0xD8000xDBFF first 16-bit code unit or high surrogate (W1) */
case 0xD8:
case 0xD9:
case 0xDA:
case 0xDB:
return BT_LEAD4;
/* 0xDC000xDFFF second 16-bit code unit or low surrogate (W2) */
case 0xDC:
case 0xDD:
case 0xDE:
@ -599,8 +601,8 @@ unicode_byte_type(char hi, char lo) {
return BT_TRAIL;
case 0xFF:
switch ((unsigned char)lo) {
case 0xFF:
case 0xFE:
case 0xFF: /* noncharacter-FFFF */
case 0xFE: /* noncharacter-FFFE */
return BT_NONXML;
}
break;

View file

@ -31,43 +31,43 @@
*/
enum {
BT_NONXML,
BT_MALFORM,
BT_LT,
BT_AMP,
BT_RSQB,
BT_LEAD2,
BT_LEAD3,
BT_LEAD4,
BT_TRAIL,
BT_CR,
BT_LF,
BT_GT,
BT_QUOT,
BT_APOS,
BT_EQUALS,
BT_QUEST,
BT_EXCL,
BT_SOL,
BT_SEMI,
BT_NUM,
BT_LSQB,
BT_S,
BT_NMSTRT,
BT_COLON,
BT_HEX,
BT_DIGIT,
BT_NAME,
BT_MINUS,
BT_NONXML, /* e.g. noncharacter-FFFF */
BT_MALFORM, /* illegal, with regard to encoding */
BT_LT, /* less than = "<" */
BT_AMP, /* ampersand = "&" */
BT_RSQB, /* right square bracket = "[" */
BT_LEAD2, /* lead byte of a 2-byte UTF-8 character */
BT_LEAD3, /* lead byte of a 3-byte UTF-8 character */
BT_LEAD4, /* lead byte of a 4-byte UTF-8 character */
BT_TRAIL, /* trailing unit, e.g. second 16-bit unit of a 4-byte char. */
BT_CR, /* carriage return = "\r" */
BT_LF, /* line feed = "\n" */
BT_GT, /* greater than = ">" */
BT_QUOT, /* quotation character = "\"" */
BT_APOS, /* aposthrophe = "'" */
BT_EQUALS, /* equal sign = "=" */
BT_QUEST, /* question mark = "?" */
BT_EXCL, /* exclamation mark = "!" */
BT_SOL, /* solidus, slash = "/" */
BT_SEMI, /* semicolon = ";" */
BT_NUM, /* number sign = "#" */
BT_LSQB, /* left square bracket = "[" */
BT_S, /* white space, e.g. "\t", " "[, "\r"] */
BT_NMSTRT, /* non-hex name start letter = "G".."Z" + "g".."z" + "_" */
BT_COLON, /* colon = ":" */
BT_HEX, /* hex letter = "A".."F" + "a".."f" */
BT_DIGIT, /* digit = "0".."9" */
BT_NAME, /* dot and middle dot = "." + chr(0xb7) */
BT_MINUS, /* minus = "-" */
BT_OTHER, /* known not to be a name or name start character */
BT_NONASCII, /* might be a name or name start character */
BT_PERCNT,
BT_LPAR,
BT_RPAR,
BT_AST,
BT_PLUS,
BT_COMMA,
BT_VERBAR
BT_PERCNT, /* percent sign = "%" */
BT_LPAR, /* left parenthesis = "(" */
BT_RPAR, /* right parenthesis = "(" */
BT_AST, /* asterisk = "*" */
BT_PLUS, /* plus sign = "+" */
BT_COMMA, /* comma = "," */
BT_VERBAR /* vertical bar = "|" */
};
#include <stddef.h>