mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-05 21:24:59 +00:00
Always consume BOM bytes when found in prolog
The byte order mark is not correctly consumed when followed by an incomplete token in a non-final parse. This results in the BOM staying in the buffer, causing an invalid token error later. This was not detected by existing tests because they either parse everything in one call, or add a single byte at a time. By moving `s` forward when we find a BOM, we make sure that the BOM bytes are properly consumed in all cases.
This commit is contained in:
parent
c803b93e87
commit
b1e955449c
2 changed files with 88 additions and 9 deletions
|
@ -36,6 +36,7 @@
|
|||
Copyright (c) 2022 Samanta Navarro <ferivoz@riseup.net>
|
||||
Copyright (c) 2022 Jeffrey Walton <noloader@gmail.com>
|
||||
Copyright (c) 2022 Jann Horn <jannh@google.com>
|
||||
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
|
||||
Licensed under the MIT license:
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
|
@ -4483,15 +4484,15 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
|
|||
parser->m_processor = entityValueProcessor;
|
||||
return entityValueProcessor(parser, next, end, nextPtr);
|
||||
}
|
||||
/* If we are at the end of the buffer, this would cause XmlPrologTok to
|
||||
return XML_TOK_NONE on the next call, which would then cause the
|
||||
function to exit with *nextPtr set to s - that is what we want for other
|
||||
tokens, but not for the BOM - we would rather like to skip it;
|
||||
then, when this routine is entered the next time, XmlPrologTok will
|
||||
return XML_TOK_INVALID, since the BOM is still in the buffer
|
||||
/* XmlPrologTok has now set the encoding based on the BOM it found, and we
|
||||
must move s and nextPtr forward to consume the BOM.
|
||||
|
||||
If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we
|
||||
would leave the BOM in the buffer and return. On the next call to this
|
||||
function, our XmlPrologTok call would return XML_TOK_INVALID, since it
|
||||
is not valid to have multiple BOMs.
|
||||
*/
|
||||
else if (tok == XML_TOK_BOM && next == end
|
||||
&& ! parser->m_parsingStatus.finalBuffer) {
|
||||
else if (tok == XML_TOK_BOM) {
|
||||
# ifdef XML_DTD
|
||||
if (! accountingDiffTolerated(parser, tok, s, next, __LINE__,
|
||||
XML_ACCOUNT_DIRECT)) {
|
||||
|
@ -4501,7 +4502,7 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end,
|
|||
# endif
|
||||
|
||||
*nextPtr = next;
|
||||
return XML_ERROR_NONE;
|
||||
s = next;
|
||||
}
|
||||
/* If we get this token, we have the start of what might be a
|
||||
normal tag, but not a declaration (i.e. it doesn't begin with
|
||||
|
|
|
@ -2955,6 +2955,83 @@ START_TEST(test_bad_ignore_section) {
|
|||
}
|
||||
END_TEST
|
||||
|
||||
struct bom_testdata {
|
||||
const char *external;
|
||||
int split;
|
||||
XML_Bool nested_callback_happened;
|
||||
};
|
||||
|
||||
static int XMLCALL
|
||||
external_bom_checker(XML_Parser parser, const XML_Char *context,
|
||||
const XML_Char *base, const XML_Char *systemId,
|
||||
const XML_Char *publicId) {
|
||||
const char *text = "";
|
||||
UNUSED_P(base);
|
||||
UNUSED_P(systemId);
|
||||
UNUSED_P(publicId);
|
||||
|
||||
XML_Parser ext_parser = XML_ExternalEntityParserCreate(parser, context, NULL);
|
||||
if (ext_parser == NULL)
|
||||
fail("Could not create external entity parser");
|
||||
|
||||
if (! xcstrcmp(systemId, XCS("004-2.ent"))) {
|
||||
struct bom_testdata *const testdata
|
||||
= (struct bom_testdata *)XML_GetUserData(parser);
|
||||
const char *const external = testdata->external;
|
||||
const int split = testdata->split;
|
||||
testdata->nested_callback_happened = XML_TRUE;
|
||||
|
||||
if (XML_Parse(ext_parser, external, split, XML_FALSE) != XML_STATUS_OK) {
|
||||
xml_failure(ext_parser);
|
||||
}
|
||||
text = external + split; // the parse below will continue where we left off.
|
||||
} else if (! xcstrcmp(systemId, XCS("004-1.ent"))) {
|
||||
text = "<!ELEMENT doc EMPTY>\n"
|
||||
"<!ENTITY % e1 SYSTEM '004-2.ent'>\n"
|
||||
"<!ENTITY % e2 '%e1;'>\n";
|
||||
} else {
|
||||
fail("unknown systemId");
|
||||
}
|
||||
|
||||
if (XML_Parse(ext_parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
|
||||
xml_failure(ext_parser);
|
||||
|
||||
XML_ParserFree(ext_parser);
|
||||
return XML_STATUS_OK;
|
||||
}
|
||||
|
||||
/* regression test: BOM should be consumed when followed by a partial token. */
|
||||
START_TEST(test_external_bom_consumed) {
|
||||
const char *const text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
|
||||
"<doc></doc>\n";
|
||||
const char *const external = "\xEF\xBB\xBF<!ATTLIST doc a1 CDATA 'value'>";
|
||||
const int len = (int)strlen(external);
|
||||
for (int split = 0; split <= len; ++split) {
|
||||
set_subtest("split at byte %d", split);
|
||||
|
||||
struct bom_testdata testdata;
|
||||
testdata.external = external;
|
||||
testdata.split = split;
|
||||
testdata.nested_callback_happened = XML_FALSE;
|
||||
|
||||
XML_Parser parser = XML_ParserCreate(NULL);
|
||||
if (parser == NULL) {
|
||||
fail("Couldn't create parser");
|
||||
}
|
||||
XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
|
||||
XML_SetExternalEntityRefHandler(parser, external_bom_checker);
|
||||
XML_SetUserData(parser, &testdata);
|
||||
if (_XML_Parse_SINGLE_BYTES(parser, text, (int)strlen(text), XML_TRUE)
|
||||
== XML_STATUS_ERROR)
|
||||
xml_failure(parser);
|
||||
if (! testdata.nested_callback_happened) {
|
||||
fail("ref handler not called");
|
||||
}
|
||||
XML_ParserFree(parser);
|
||||
}
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/* Test recursive parsing */
|
||||
START_TEST(test_external_entity_values) {
|
||||
const char *text = "<!DOCTYPE doc SYSTEM '004-1.ent'>\n"
|
||||
|
@ -5047,6 +5124,7 @@ make_basic_test_case(Suite *s) {
|
|||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16_be);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_bad_ignore_section);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_bom_consumed);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_entity_values);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_not_standalone);
|
||||
tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_value_abort);
|
||||
|
|
Loading…
Add table
Reference in a new issue