Always consume BOM bytes when found in prolog

The byte order mark is not correctly consumed when followed by an
incomplete token in a non-final parse. This results in the BOM staying
in the buffer, causing an invalid token error later.

This was not detected by existing tests because they either parse
everything in one call, or add a single byte at a time.

By moving `s` forward when we find a BOM, we make sure that the BOM
bytes are properly consumed in all cases.
This commit is contained in:
Snild Dolkow 2023-08-31 12:36:43 +02:00
parent c803b93e87
commit b1e955449c
2 changed files with 88 additions and 9 deletions

View file

@ -36,6 +36,7 @@
Copyright (c) 2022 Samanta Navarro <ferivoz@riseup.net>
Copyright (c) 2022 Jeffrey Walton <noloader@gmail.com>
Copyright (c) 2022 Jann Horn <jannh@google.com>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -4483,15 +4484,15 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
parser->m_processor = entityValueProcessor;
return entityValueProcessor(parser, next, end, nextPtr);
}
/* If we are at the end of the buffer, this would cause XmlPrologTok to
return XML_TOK_NONE on the next call, which would then cause the
function to exit with *nextPtr set to s - that is what we want for other
tokens, but not for the BOM - we would rather like to skip it;
then, when this routine is entered the next time, XmlPrologTok will
return XML_TOK_INVALID, since the BOM is still in the buffer
/* XmlPrologTok has now set the encoding based on the BOM it found, and we
must move s and nextPtr forward to consume the BOM.
If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we
would leave the BOM in the buffer and return. On the next call to this
function, our XmlPrologTok call would return XML_TOK_INVALID, since it
is not valid to have multiple BOMs.
*/
else if (tok == XML_TOK_BOM && next == end
&& ! parser->m_parsingStatus.finalBuffer) {
else if (tok == XML_TOK_BOM) {
# ifdef XML_DTD
if (! accountingDiffTolerated(parser, tok, s, next, __LINE__,
XML_ACCOUNT_DIRECT)) {
@ -4501,7 +4502,7 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
# endif
*nextPtr = next;
return XML_ERROR_NONE;
s = next;
}
/* If we get this token, we have the start of what might be a
normal tag, but not a declaration (i.e. it doesn't begin with

View file

@ -2955,6 +2955,83 @@ START_TEST(test_bad_ignore_section) {
}
END_TEST
struct bom_testdata {
const char *external;
int split;
XML_Bool nested_callback_happened;
};
static int XMLCALL
external_bom_checker(XML_Parser parser, const XML_Char *context,
const XML_Char *base, const XML_Char *systemId,
const XML_Char *publicId) {
const char *text = "";
UNUSED_P(base);
UNUSED_P(systemId);
UNUSED_P(publicId);
XML_Parser ext_parser = XML_ExternalEntityParserCreate(parser, context, NULL);
if (ext_parser == NULL)
fail("Could not create external entity parser");
if (! xcstrcmp(systemId, XCS("004-2.ent"))) {
struct bom_testdata *const testdata
= (struct bom_testdata *)XML_GetUserData(parser);
const char *const external = testdata->external;
const int split = testdata->split;
testdata->nested_callback_happened = XML_TRUE;
if (XML_Parse(ext_parser, external, split, XML_FALSE) != XML_STATUS_OK) {
xml_failure(ext_parser);
}
text = external + split; // the parse below will continue where we left off.
} else if (! xcstrcmp(systemId, XCS("004-1.ent"))) {
text = "<!ELEMENT doc EMPTY>\n"
"<!ENTITY % e1 SYSTEM '004-2.ent'>\n"
"<!ENTITY % e2 '%e1;'>\n";
} else {
fail("unknown systemId");
}
if (XML_Parse(ext_parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
xml_failure(ext_parser);
XML_ParserFree(ext_parser);
return XML_STATUS_OK;
}
/* regression test: BOM should be consumed when followed by a partial token. */
START_TEST(test_external_bom_consumed) {
const char *const text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
"<doc></doc>\n";
const char *const external = "\xEF\xBB\xBF<!ATTLIST doc a1 CDATA 'value'>";
const int len = (int)strlen(external);
for (int split = 0; split <= len; ++split) {
set_subtest("split at byte %d", split);
struct bom_testdata testdata;
testdata.external = external;
testdata.split = split;
testdata.nested_callback_happened = XML_FALSE;
XML_Parser parser = XML_ParserCreate(NULL);
if (parser == NULL) {
fail("Couldn't create parser");
}
XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
XML_SetExternalEntityRefHandler(parser, external_bom_checker);
XML_SetUserData(parser, &testdata);
if (_XML_Parse_SINGLE_BYTES(parser, text, (int)strlen(text), XML_TRUE)
== XML_STATUS_ERROR)
xml_failure(parser);
if (! testdata.nested_callback_happened) {
fail("ref handler not called");
}
XML_ParserFree(parser);
}
}
END_TEST
/* Test recursive parsing */
START_TEST(test_external_entity_values) {
const char *text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
@ -5047,6 +5124,7 @@ make_basic_test_case(Suite *s) {
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16_be);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_bad_ignore_section);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_bom_consumed);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_entity_values);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_not_standalone);
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_value_abort);