Move more encoding tests out of runtests.c

This commit is contained in:
Rhodri James 2022-10-28 22:04:25 +01:00
parent 6933957fb6
commit eef3e9f4c2
4 changed files with 224 additions and 218 deletions

View file

@ -4232,6 +4232,183 @@ START_TEST(test_ext_entity_utf16_le) {
}
END_TEST
/* An external entity containing BOM-less little-endian UTF-16 is
 * parsed with no explicit encoding set.
 * Without a BOM to contradict it, the default encoding (UTF-8) stays
 * in force, so the 0x00 bytes are rejected as invalid XML characters
 * and the entity parse must fail.  The entity is handed over as one
 * whole buffer, not byte by byte, so that different code paths in the
 * initial scanning routines are exercised.
 */
START_TEST(test_ext_entity_utf16_unknown) {
  ExtFaults2 fault_spec = {"a\0b\0c\0", /* text parsed as the entity */
                           6,           /* its length in bytes */
                           "Invalid character in entity not faulted",
                           NULL, /* no explicit encoding */
                           XML_ERROR_INVALID_TOKEN};
  const char *doc = "<!DOCTYPE doc [\n"
                    " <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
                    "]>\n"
                    "<doc>&en;</doc>";

  XML_SetUserData(g_parser, &fault_spec);
  XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter2);
  /* The handler returns XML_STATUS_ERROR, so the outer parse must fail */
  expect_failure(doc, XML_ERROR_EXTERNAL_ENTITY_HANDLING,
                 "Invalid character should not have been accepted");
}
END_TEST
/* Test an external entity whose bytes (0xEF 0xBB 0x80) almost, but not
 * quite, form a UTF-8 BOM (0xEF 0xBB 0xBF).  The collected character
 * data is checked against the character those bytes encode, U+FEC0.
 */
START_TEST(test_ext_entity_utf8_non_bom) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("\xfec0");
#else
  const XML_Char *want = XCS("\xef\xbb\x80");
#endif
  const char *doc = "<!DOCTYPE doc [\n"
                    " <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
                    "]>\n"
                    "<doc>&en;</doc>";
  ExtTest2 entity
      = {"\xef\xbb\x80", /* Arabic letter DAD medial form, U+FEC0 */
         3, NULL, NULL, EE_PARSE_NONE};
  CharData seen;

  CharData_Init(&seen);
  entity.storage = &seen;
  XML_SetUserData(g_parser, &entity);
  XML_SetExternalEntityRefHandler(g_parser, external_entity_loader2);
  XML_SetCharacterDataHandler(g_parser, ext2_accumulate_characters);
  if (XML_STATUS_ERROR
      == _XML_Parse_SINGLE_BYTES(g_parser, doc, (int)strlen(doc), XML_TRUE)) {
    xml_failure(g_parser);
  }
  CharData_CheckXMLChars(&seen, want);
}
END_TEST
/* A two-byte UTF-8 character (0xC3 0xA9, U+00E9) inside a CDATA
 * section must reach the character data check unchanged.
 */
START_TEST(test_utf8_in_cdata_section) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("one \x00e9 two");
#else
  const XML_Char *want = XCS("one \xc3\xa9 two");
#endif
  const char *doc = "<doc><![CDATA[one \xc3\xa9 two]]></doc>";

  run_character_check(doc, want);
}
END_TEST
/* Test that UTF-8 in a CDATA section is passed through correctly when
 * a lone ']' sits between two multi-byte characters (0xC3 0xA9,
 * U+00E9) ahead of the real "]]>" terminator.
 */
START_TEST(test_utf8_in_cdata_section_2) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("\x00e9]\x00e9two");
#else
  const XML_Char *want = XCS("\xc3\xa9]\xc3\xa9two");
#endif
  const char *doc = "<doc><![CDATA[\xc3\xa9]\xc3\xa9two]]></doc>";

  run_character_check(doc, want);
}
END_TEST
/* Regression test for UTF-8 validation inside element names.
 *
 * Every byte sequence in cases[] is placed into a start tag twice: once
 * as the first thing in the name and once directly after an 'a'.  Each
 * resulting document is parsed by a fresh parser and the outcome is
 * compared with the expectation recorded for that position.  Every
 * mismatch is printed to stderr and counted; the test fails once, at
 * the end, if any combination misbehaved.
 */
START_TEST(test_utf8_in_start_tags) {
struct test_case {
// Expected to parse when the sequence follows an 'a' in the tag name
bool goodName;
// Expected to parse when the sequence is the first thing in the tag name
bool goodNameStart;
// NUL-terminated byte sequence spliced into the tag name
const char *tagName;
};
// The idea with the tests below is this:
// We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
// go to isNever and are hence not a concern.
//
// We start with a character that is a valid name character
// (or even name-start character, see XML 1.0r4 spec) and then we flip
// single bits at places where (1) the result leaves the UTF-8 encoding space
// and (2) we stay in the same n-byte sequence family.
//
// The flipped bits are highlighted in angle brackets in comments,
// e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
// the most significant bit to 1 to leave UTF-8 encoding space.
struct test_case cases[] = {
// 1-byte UTF-8: [0xxx xxxx]
{true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
{false, false, "\xBA"}, // [<1>011 1010]
{true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
{false, false, "\xB9"}, // [<1>011 1001]
// 2-byte UTF-8: [110x xxxx] [10xx xxxx]
{true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
// Arabic small waw U+06E5
{false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
{false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
{false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
{true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
// combining char U+0301
{false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
{false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
{false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
// 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
{true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
// Devanagari Letter A U+0905
{false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
{false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
{false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
{false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
{false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
{true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
// combining char U+0901
{false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
{false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
{false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
{false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
{false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
};
// The two positions each sequence is tried in: first in the name, or not
const bool atNameStart[] = {true, false};
size_t i = 0;
char doc[1024];
size_t failCount = 0;
for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
size_t j = 0;
for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
// Pick the expectation that matches the position under test
const bool expectedSuccess
= atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
// Build "<SEQ><!--" or "<aSEQ><!--".
// NOTE(review): the trailing unterminated comment presumably gives the
// parser input beyond the start tag without completing the document;
// confirm the intent.
snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
cases[i].tagName);
// A fresh parser per document keeps the cases independent of each other
XML_Parser parser = XML_ParserCreate(NULL);
const enum XML_Status status
= XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
bool success = true;
// The parse must succeed exactly when the sequence is expected to be valid
if ((status == XML_STATUS_OK) != expectedSuccess) {
success = false;
}
// Any rejection must be reported as XML_ERROR_INVALID_TOKEN specifically
if ((status == XML_STATUS_ERROR)
&& (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
success = false;
}
if (! success) {
// Report and count, rather than failing at once, so every bad
// combination is listed in a single run
fprintf(
stderr,
"FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
(unsigned)i + 1u, atNameStart[j] ? " " : "not ",
(unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
failCount++;
}
XML_ParserFree(parser);
}
}
if (failCount > 0) {
fail("UTF-8 regression detected");
}
}
END_TEST
TCase *
make_basic_test_case(Suite *s) {
TCase *tc_basic = tcase_create("basic tests");
@ -4432,6 +4609,11 @@ make_basic_test_case(Suite *s) {
tcase_add_test(tc_basic, test_ext_entity_latin1_utf16be_bom2);
tcase_add_test(tc_basic, test_ext_entity_utf16_be);
tcase_add_test(tc_basic, test_ext_entity_utf16_le);
tcase_add_test(tc_basic, test_ext_entity_utf16_unknown);
tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
tcase_add_test(tc_basic, test_utf8_in_cdata_section);
tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
tcase_add_test(tc_basic, test_utf8_in_start_tags);
return tc_basic; /* TEMPORARY: this will become a void function */
}

View file

@ -1043,6 +1043,34 @@ external_entity_loader2(XML_Parser parser, const XML_Char *context,
return XML_STATUS_OK;
}
/* External entity reference handler for tests in which parsing the
 * entity is required to fail.
 *
 * The parser's user data must point to an ExtFaults2.  Its parse_text
 * (parse_len bytes) is parsed as the complete entity by a new external
 * entity parser, after applying the encoding if one is given.  The
 * test fails with fail_text if that parse does not return
 * XML_STATUS_ERROR, and fails if the reported error code is not the
 * expected one.  Always returns XML_STATUS_ERROR.
 */
int XMLCALL
external_entity_faulter2(XML_Parser parser, const XML_Char *context,
                         const XML_Char *base, const XML_Char *systemId,
                         const XML_Char *publicId) {
  ExtFaults2 *faults = (ExtFaults2 *)XML_GetUserData(parser);
  XML_Parser ext;
  enum XML_Status status;

  UNUSED_P(base);
  UNUSED_P(systemId);
  UNUSED_P(publicId);

  ext = XML_ExternalEntityParserCreate(parser, context, NULL);
  if (ext == NULL)
    fail("Could not create external entity parser");

  /* Only override the encoding when the test asked for one */
  if (faults->encoding != NULL && ! XML_SetEncoding(ext, faults->encoding))
    fail("XML_SetEncoding() ignored for external entity");

  /* The whole entity goes in as one final buffer and has to be rejected */
  status = XML_Parse(ext, faults->parse_text, faults->parse_len, XML_TRUE);
  if (status != XML_STATUS_ERROR)
    fail(faults->fail_text);
  if (XML_GetErrorCode(ext) != faults->error)
    xml_failure(ext);

  XML_ParserFree(ext);
  return XML_STATUS_ERROR;
}
/* NotStandalone handlers */
int XMLCALL

View file

@ -302,6 +302,20 @@ extern int XMLCALL external_entity_loader2(XML_Parser parser,
const XML_Char *systemId,
const XML_Char *publicId);
/* Test parameters read by external_entity_faulter2() from the parser's
 * user data.  Tests initialise this positionally, so the field order
 * matters.
 */
typedef struct ExtFaults2 {
/* Bytes parsed as the external entity */
const char *parse_text;
/* Number of bytes of parse_text to parse */
int parse_len;
/* Failure message used if the entity parse does not return an error */
const char *fail_text;
/* Encoding to set on the entity parser, or NULL to leave it unset */
const XML_Char *encoding;
/* Error code the entity parser is required to report */
enum XML_Error error;
} ExtFaults2;
/* External entity reference handler that parses the text described by
 * the ExtFaults2 in the parser's user data and requires that parse to
 * fail with the expected error code.  Always returns XML_STATUS_ERROR.
 */
extern int XMLCALL external_entity_faulter2(XML_Parser parser,
const XML_Char *context,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId);
/* NotStandalone handlers */
extern int XMLCALL reject_not_standalone_handler(void *userData);

View file

@ -74,219 +74,6 @@
XML_Parser g_parser = NULL;
/* Test little-endian UTF-16 given no explicit encoding.
* The existing default encoding (UTF-8) is assumed to hold without a
* BOM to contradict it, so the entity value will in fact provoke an
* error because 0x00 is not a valid XML character. We parse the
* whole buffer in one go rather than feeding it in byte by byte to
* exercise different code paths in the initial scanning routines.
*/
/* Test parameters read by external_entity_faulter2() from the parser's
 * user data.  Tests initialise this positionally, so the field order
 * matters.
 */
typedef struct ExtFaults2 {
/* Bytes parsed as the external entity */
const char *parse_text;
/* Number of bytes of parse_text to parse */
int parse_len;
/* Failure message used if the entity parse does not return an error */
const char *fail_text;
/* Encoding to set on the entity parser, or NULL to leave it unset */
const XML_Char *encoding;
/* Error code the entity parser is required to report */
enum XML_Error error;
} ExtFaults2;
/* External entity reference handler for tests in which parsing the
 * entity is required to fail.
 *
 * The parser's user data must point to an ExtFaults2.  Its parse_text
 * (parse_len bytes) is parsed as the complete entity by a new external
 * entity parser, after applying the encoding if one is given.  The
 * test fails with fail_text if that parse does not return
 * XML_STATUS_ERROR, and fails if the reported error code is not the
 * expected one.  Always returns XML_STATUS_ERROR.
 */
static int XMLCALL
external_entity_faulter2(XML_Parser parser, const XML_Char *context,
                         const XML_Char *base, const XML_Char *systemId,
                         const XML_Char *publicId) {
  ExtFaults2 *faults = (ExtFaults2 *)XML_GetUserData(parser);
  XML_Parser ext;
  enum XML_Status status;

  UNUSED_P(base);
  UNUSED_P(systemId);
  UNUSED_P(publicId);

  ext = XML_ExternalEntityParserCreate(parser, context, NULL);
  if (ext == NULL)
    fail("Could not create external entity parser");

  /* Only override the encoding when the test asked for one */
  if (faults->encoding != NULL && ! XML_SetEncoding(ext, faults->encoding))
    fail("XML_SetEncoding() ignored for external entity");

  /* The whole entity goes in as one final buffer and has to be rejected */
  status = XML_Parse(ext, faults->parse_text, faults->parse_len, XML_TRUE);
  if (status != XML_STATUS_ERROR)
    fail(faults->fail_text);
  if (XML_GetErrorCode(ext) != faults->error)
    xml_failure(ext);

  XML_ParserFree(ext);
  return XML_STATUS_ERROR;
}
/* An external entity containing BOM-less little-endian UTF-16 is
 * parsed with no explicit encoding set; the entity parse must fail
 * with XML_ERROR_INVALID_TOKEN, and the outer parse with
 * XML_ERROR_EXTERNAL_ENTITY_HANDLING.
 */
START_TEST(test_ext_entity_utf16_unknown) {
  ExtFaults2 fault_spec = {"a\0b\0c\0", /* text parsed as the entity */
                           6,           /* its length in bytes */
                           "Invalid character in entity not faulted",
                           NULL, /* no explicit encoding */
                           XML_ERROR_INVALID_TOKEN};
  const char *doc = "<!DOCTYPE doc [\n"
                    " <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
                    "]>\n"
                    "<doc>&en;</doc>";

  XML_SetUserData(g_parser, &fault_spec);
  XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter2);
  /* The handler returns XML_STATUS_ERROR, so the outer parse must fail */
  expect_failure(doc, XML_ERROR_EXTERNAL_ENTITY_HANDLING,
                 "Invalid character should not have been accepted");
}
END_TEST
/* Test an external entity whose bytes (0xEF 0xBB 0x80) almost, but not
 * quite, form a UTF-8 BOM (0xEF 0xBB 0xBF).  The collected character
 * data is checked against the character those bytes encode, U+FEC0.
 */
START_TEST(test_ext_entity_utf8_non_bom) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("\xfec0");
#else
  const XML_Char *want = XCS("\xef\xbb\x80");
#endif
  const char *doc = "<!DOCTYPE doc [\n"
                    " <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
                    "]>\n"
                    "<doc>&en;</doc>";
  ExtTest2 entity
      = {"\xef\xbb\x80", /* Arabic letter DAD medial form, U+FEC0 */
         3, NULL, NULL, EE_PARSE_NONE};
  CharData seen;

  CharData_Init(&seen);
  entity.storage = &seen;
  XML_SetUserData(g_parser, &entity);
  XML_SetExternalEntityRefHandler(g_parser, external_entity_loader2);
  XML_SetCharacterDataHandler(g_parser, ext2_accumulate_characters);
  if (XML_STATUS_ERROR
      == _XML_Parse_SINGLE_BYTES(g_parser, doc, (int)strlen(doc), XML_TRUE)) {
    xml_failure(g_parser);
  }
  CharData_CheckXMLChars(&seen, want);
}
END_TEST
/* A two-byte UTF-8 character (0xC3 0xA9, U+00E9) inside a CDATA
 * section must reach the character data check unchanged.
 */
START_TEST(test_utf8_in_cdata_section) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("one \x00e9 two");
#else
  const XML_Char *want = XCS("one \xc3\xa9 two");
#endif
  const char *doc = "<doc><![CDATA[one \xc3\xa9 two]]></doc>";

  run_character_check(doc, want);
}
END_TEST
/* Test that UTF-8 in a CDATA section is passed through correctly when
 * a lone ']' sits between two multi-byte characters (0xC3 0xA9,
 * U+00E9) ahead of the real "]]>" terminator.
 */
START_TEST(test_utf8_in_cdata_section_2) {
#ifdef XML_UNICODE
  const XML_Char *want = XCS("\x00e9]\x00e9two");
#else
  const XML_Char *want = XCS("\xc3\xa9]\xc3\xa9two");
#endif
  const char *doc = "<doc><![CDATA[\xc3\xa9]\xc3\xa9two]]></doc>";

  run_character_check(doc, want);
}
END_TEST
/* Regression test for UTF-8 validation inside element names.
 *
 * Every byte sequence in cases[] is placed into a start tag twice: once
 * as the first thing in the name and once directly after an 'a'.  Each
 * resulting document is parsed by a fresh parser and the outcome is
 * compared with the expectation recorded for that position.  Every
 * mismatch is printed to stderr and counted; the test fails once, at
 * the end, if any combination misbehaved.
 */
START_TEST(test_utf8_in_start_tags) {
struct test_case {
// Expected to parse when the sequence follows an 'a' in the tag name
bool goodName;
// Expected to parse when the sequence is the first thing in the tag name
bool goodNameStart;
// NUL-terminated byte sequence spliced into the tag name
const char *tagName;
};
// The idea with the tests below is this:
// We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
// go to isNever and are hence not a concern.
//
// We start with a character that is a valid name character
// (or even name-start character, see XML 1.0r4 spec) and then we flip
// single bits at places where (1) the result leaves the UTF-8 encoding space
// and (2) we stay in the same n-byte sequence family.
//
// The flipped bits are highlighted in angle brackets in comments,
// e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
// the most significant bit to 1 to leave UTF-8 encoding space.
struct test_case cases[] = {
// 1-byte UTF-8: [0xxx xxxx]
{true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
{false, false, "\xBA"}, // [<1>011 1010]
{true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
{false, false, "\xB9"}, // [<1>011 1001]
// 2-byte UTF-8: [110x xxxx] [10xx xxxx]
{true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
// Arabic small waw U+06E5
{false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
{false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
{false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
{true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
// combining char U+0301
{false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
{false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
{false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
// 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
{true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
// Devanagari Letter A U+0905
{false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
{false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
{false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
{false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
{false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
{true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
// combining char U+0901
{false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
{false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
{false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
{false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
{false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
};
// The two positions each sequence is tried in: first in the name, or not
const bool atNameStart[] = {true, false};
size_t i = 0;
char doc[1024];
size_t failCount = 0;
for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
size_t j = 0;
for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
// Pick the expectation that matches the position under test
const bool expectedSuccess
= atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
// Build "<SEQ><!--" or "<aSEQ><!--".
// NOTE(review): the trailing unterminated comment presumably gives the
// parser input beyond the start tag without completing the document;
// confirm the intent.
snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
cases[i].tagName);
// A fresh parser per document keeps the cases independent of each other
XML_Parser parser = XML_ParserCreate(NULL);
const enum XML_Status status
= XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
bool success = true;
// The parse must succeed exactly when the sequence is expected to be valid
if ((status == XML_STATUS_OK) != expectedSuccess) {
success = false;
}
// Any rejection must be reported as XML_ERROR_INVALID_TOKEN specifically
if ((status == XML_STATUS_ERROR)
&& (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
success = false;
}
if (! success) {
// Report and count, rather than failing at once, so every bad
// combination is listed in a single run
fprintf(
stderr,
"FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
(unsigned)i + 1u, atNameStart[j] ? " " : "not ",
(unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
failCount++;
}
XML_ParserFree(parser);
}
}
if (failCount > 0) {
fail("UTF-8 regression detected");
}
}
END_TEST
/* Test trailing spaces in elements are accepted */
static void XMLCALL
record_element_end_handler(void *userData, const XML_Char *name) {
@ -6135,11 +5922,6 @@ make_suite(void) {
TCase *tc_accounting = tcase_create("accounting tests");
#endif
tcase_add_test(tc_basic, test_ext_entity_utf16_unknown);
tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
tcase_add_test(tc_basic, test_utf8_in_cdata_section);
tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
tcase_add_test(tc_basic, test_utf8_in_start_tags);
tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
tcase_add_test(tc_basic, test_utf16_attribute);
tcase_add_test(tc_basic, test_utf16_second_attr);