Merge pull request #753 from SonyMobile/consume-bom

Fix parse-size-dependent "invalid token" error for external entities that start with a byte order mark
This commit is contained in:
Sebastian Pipping 2023-09-22 18:06:57 +02:00 committed by GitHub
commit bcdc25b04d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 143 additions and 9 deletions

View file

@ -36,6 +36,7 @@
Copyright (c) 2022 Samanta Navarro <ferivoz@riseup.net>
Copyright (c) 2022 Jeffrey Walton <noloader@gmail.com>
Copyright (c) 2022 Jann Horn <jannh@google.com>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -4483,15 +4484,15 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
parser->m_processor = entityValueProcessor;
return entityValueProcessor(parser, next, end, nextPtr);
}
/* If we are at the end of the buffer, this would cause XmlPrologTok to
return XML_TOK_NONE on the next call, which would then cause the
function to exit with *nextPtr set to s - that is what we want for other
tokens, but not for the BOM - we would rather like to skip it;
then, when this routine is entered the next time, XmlPrologTok will
return XML_TOK_INVALID, since the BOM is still in the buffer
/* XmlPrologTok has now set the encoding based on the BOM it found, and we
must move s and nextPtr forward to consume the BOM.
If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we
would leave the BOM in the buffer and return. On the next call to this
function, our XmlPrologTok call would return XML_TOK_INVALID, since it
is not valid to have multiple BOMs.
*/
else if (tok == XML_TOK_BOM && next == end
&& ! parser->m_parsingStatus.finalBuffer) {
else if (tok == XML_TOK_BOM) {
# ifdef XML_DTD
if (! accountingDiffTolerated(parser, tok, s, next, __LINE__,
XML_ACCOUNT_DIRECT)) {
@ -4501,7 +4502,7 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
# endif
*nextPtr = next;
return XML_ERROR_NONE;
s = next;
}
/* If we get this token, we have the start of what might be a
normal tag, but not a declaration (i.e. it doesn't begin with

View file

@ -18,6 +18,7 @@
Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
Copyright (c) 2020 Tim Gates <tim.gates@iress.com>
Copyright (c) 2021 Dong-hee Na <donghee.na@python.org>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -384,6 +385,7 @@ START_TEST(test_helper_unsigned_char_to_printable) {
// Smoke test
unsigned char uc = 0;
for (; uc < (unsigned char)-1; uc++) {
set_subtest("char %u", (unsigned)uc);
const char *const printable = unsignedCharToPrintable(uc);
if (printable == NULL)
fail("unsignedCharToPrintable returned NULL");
@ -392,8 +394,10 @@ START_TEST(test_helper_unsigned_char_to_printable) {
}
// Two concrete samples
set_subtest("char 'A'");
if (strcmp(unsignedCharToPrintable('A'), "A") != 0)
fail("unsignedCharToPrintable result mistaken");
set_subtest("char '\\'");
if (strcmp(unsignedCharToPrintable('\\'), "\\\\") != 0)
fail("unsignedCharToPrintable result mistaken");
}

View file

@ -1212,6 +1212,7 @@ START_TEST(test_ext_entity_invalid_parse) {
const ExtFaults *fault = faults;
for (; fault->parse_text != NULL; fault++) {
set_subtest("\"%s\"", fault->parse_text);
XML_SetParamEntityParsing(g_parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter);
XML_SetUserData(g_parser, (void *)fault);
@ -1282,6 +1283,7 @@ START_TEST(test_dtd_attr_handling) {
AttTest *test;
for (test = attr_data; test->definition != NULL; test++) {
set_subtest("%s", test->definition);
XML_SetAttlistDeclHandler(g_parser, verify_attlist_decl_handler);
XML_SetUserData(g_parser, test);
if (_XML_Parse_SINGLE_BYTES(g_parser, prolog, (int)strlen(prolog),
@ -1670,6 +1672,7 @@ START_TEST(test_bad_cdata) {
size_t i = 0;
for (; i < sizeof(cases) / sizeof(struct CaseData); i++) {
set_subtest("%s", cases[i].text);
const enum XML_Status actualStatus = _XML_Parse_SINGLE_BYTES(
g_parser, cases[i].text, (int)strlen(cases[i].text), XML_TRUE);
const enum XML_Error actualError = XML_GetErrorCode(g_parser);
@ -1737,6 +1740,7 @@ START_TEST(test_bad_cdata_utf16) {
size_t i;
for (i = 0; i < sizeof(cases) / sizeof(struct CaseData); i++) {
set_subtest("case %lu", (long unsigned)(i + 1));
enum XML_Status actual_status;
enum XML_Error actual_error;
@ -2336,6 +2340,7 @@ START_TEST(test_ext_entity_invalid_suspended_parse) {
ExtFaults *fault;
for (fault = &faults[0]; fault->parse_text != NULL; fault++) {
set_subtest("%s", fault->parse_text);
XML_SetParamEntityParsing(g_parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(g_parser,
external_entity_suspending_faulter);
@ -2939,6 +2944,7 @@ START_TEST(test_bad_ignore_section) {
ExtFaults *fault;
for (fault = &faults[0]; fault->parse_text != NULL; fault++) {
set_subtest("%s", fault->parse_text);
XML_SetParamEntityParsing(g_parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter);
XML_SetUserData(g_parser, fault);
@ -2949,6 +2955,83 @@ START_TEST(test_bad_ignore_section) {
}
END_TEST
struct bom_testdata {
const char *external;
int split;
XML_Bool nested_callback_happened;
};
static int XMLCALL
external_bom_checker(XML_Parser parser, const XML_Char *context,
const XML_Char *base, const XML_Char *systemId,
const XML_Char *publicId) {
const char *text = "";
UNUSED_P(base);
UNUSED_P(systemId);
UNUSED_P(publicId);
XML_Parser ext_parser = XML_ExternalEntityParserCreate(parser, context, NULL);
if (ext_parser == NULL)
fail("Could not create external entity parser");
if (! xcstrcmp(systemId, XCS("004-2.ent"))) {
struct bom_testdata *const testdata
= (struct bom_testdata *)XML_GetUserData(parser);
const char *const external = testdata->external;
const int split = testdata->split;
testdata->nested_callback_happened = XML_TRUE;
if (XML_Parse(ext_parser, external, split, XML_FALSE) != XML_STATUS_OK) {
xml_failure(ext_parser);
}
text = external + split; // the parse below will continue where we left off.
} else if (! xcstrcmp(systemId, XCS("004-1.ent"))) {
text = "<!ELEMENT doc EMPTY>\n"
"<!ENTITY % e1 SYSTEM '004-2.ent'>\n"
"<!ENTITY % e2 '%e1;'>\n";
} else {
fail("unknown systemId");
}
if (XML_Parse(ext_parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
xml_failure(ext_parser);
XML_ParserFree(ext_parser);
return XML_STATUS_OK;
}
/* regression test: BOM should be consumed when followed by a partial token. */
START_TEST(test_external_bom_consumed) {
const char *const text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
"<doc></doc>\n";
const char *const external = "\xEF\xBB\xBF<!ATTLIST doc a1 CDATA 'value'>";
const int len = (int)strlen(external);
for (int split = 0; split <= len; ++split) {
set_subtest("split at byte %d", split);
struct bom_testdata testdata;
testdata.external = external;
testdata.split = split;
testdata.nested_callback_happened = XML_FALSE;
XML_Parser parser = XML_ParserCreate(NULL);
if (parser == NULL) {
fail("Couldn't create parser");
}
XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(parser, external_bom_checker);
XML_SetUserData(parser, &testdata);
if (_XML_Parse_SINGLE_BYTES(parser, text, (int)strlen(text), XML_TRUE)
== XML_STATUS_ERROR)
xml_failure(parser);
if (! testdata.nested_callback_happened) {
fail("ref handler not called");
}
XML_ParserFree(parser);
}
}
END_TEST
/* Test recursive parsing */
START_TEST(test_external_entity_values) {
const char *text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
@ -2982,6 +3065,7 @@ START_TEST(test_external_entity_values) {
int i;
for (i = 0; data_004_2[i].parse_text != NULL; i++) {
set_subtest("%s", data_004_2[i].parse_text);
XML_SetParamEntityParsing(g_parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(g_parser, external_entity_valuer);
XML_SetUserData(g_parser, &data_004_2[i]);
@ -5040,6 +5124,7 @@ make_basic_test_case(Suite *s) {
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16_be);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_bad_ignore_section);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_bom_consumed);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_entity_values);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_not_standalone);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_value_abort);

View file

@ -15,6 +15,7 @@
Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
Copyright (c) 2018 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -41,6 +42,7 @@
# undef NDEBUG /* because test suite relies on assert(...) at the moment */
#endif
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>
@ -137,17 +139,35 @@ srunner_create(Suite *suite) {
static jmp_buf env;
#define SUBTEST_LEN (50) // informative, but not too long
static char const *_check_current_function = NULL;
static char _check_current_subtest[SUBTEST_LEN];
static int _check_current_lineno = -1;
static char const *_check_current_filename = NULL;
void
_check_set_test_info(char const *function, char const *filename, int lineno) {
_check_current_function = function;
set_subtest("%s", "");
_check_current_lineno = lineno;
_check_current_filename = filename;
}
void
set_subtest(char const *fmt, ...) {
va_list ap;
va_start(ap, fmt);
vsnprintf(_check_current_subtest, SUBTEST_LEN, fmt, ap);
va_end(ap);
// replace line feeds with spaces, for nicer error logs
for (size_t i = 0; i < SUBTEST_LEN; ++i) {
if (_check_current_subtest[i] == '\n') {
_check_current_subtest[i] = ' ';
}
}
_check_current_subtest[SUBTEST_LEN - 1] = '\0'; // ensure termination
}
static void
handle_success(int verbosity) {
if (verbosity >= CK_VERBOSE) {
@ -159,6 +179,9 @@ static void
handle_failure(SRunner *runner, int verbosity, const char *phase_info) {
runner->nfailures++;
if (verbosity != CK_SILENT) {
if (strlen(_check_current_subtest) != 0) {
phase_info = _check_current_subtest;
}
printf("FAIL: %s (%s at %s:%d)\n", _check_current_function, phase_info,
_check_current_filename, _check_current_lineno);
}
@ -175,6 +198,7 @@ srunner_run_all(SRunner *runner, int verbosity) {
volatile int i;
for (i = 0; i < tc->ntests; ++i) {
runner->nchecks++;
set_subtest("%s", "");
if (tc->setup != NULL) {
/* setup */
@ -190,6 +214,7 @@ srunner_run_all(SRunner *runner, int verbosity) {
continue;
}
(tc->tests[i])();
set_subtest("%s", "");
/* teardown */
if (tc->teardown != NULL) {

View file

@ -15,6 +15,7 @@
Copyright (c) 2004-2006 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
Copyright (c) 2006-2012 Karl Waclawek <karl@waclawek.net>
Copyright (c) 2016-2017 Sebastian Pipping <sebastian@pipping.org>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -59,6 +60,19 @@ extern "C" {
# define __func__ __FUNCTION__
# endif
/* PRINTF_LIKE has two effects:
1. Make clang's -Wformat-nonliteral stop warning about non-literal format
strings in annotated functions' code.
2. Make both clang and gcc's -Wformat-nonliteral warn about *callers* of
the annotated function that use a non-literal format string.
*/
# if defined(__GNUC__)
# define PRINTF_LIKE(fmtpos, argspos) \
__attribute__((format(printf, fmtpos, argspos)))
# else
# define PRINTF_LIKE(fmtpos, argspos)
# endif
# define START_TEST(testname) \
static void testname(void) { \
_check_set_test_info(__func__, __FILE__, __LINE__); \
@ -67,6 +81,8 @@ extern "C" {
} \
}
void PRINTF_LIKE(1, 2) set_subtest(char const *fmt, ...);
# define fail(msg) _fail_unless(0, __FILE__, __LINE__, msg)
typedef void (*tcase_setup_function)(void);

View file

@ -18,6 +18,7 @@
Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
Copyright (c) 2020 Tim Gates <tim.gates@iress.com>
Copyright (c) 2021 Dong-hee Na <donghee.na@python.org>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -328,6 +329,7 @@ START_TEST(test_misc_deny_internal_entity_closing_doctype_issue_317) {
size_t inputIndex = 0;
for (; inputIndex < sizeof(inputs) / sizeof(inputs[0]); inputIndex++) {
set_subtest("%s", inputs[inputIndex]);
XML_Parser parser;
enum XML_Status parseResult;
int setParamEntityResult;

View file

@ -694,6 +694,7 @@ START_TEST(test_ns_separator_in_uri) {
size_t i = 0;
size_t failCount = 0;
for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
set_subtest("%s", cases[i].doc);
XML_Parser parser = XML_ParserCreateNS(NULL, cases[i].namesep);
XML_SetElementHandler(parser, dummy_start_element, dummy_end_element);
if (XML_Parse(parser, cases[i].doc, (int)strlen(cases[i].doc),