diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c index a18aa2e3..4227bdfa 100644 --- a/expat/lib/xmlparse.c +++ b/expat/lib/xmlparse.c @@ -36,6 +36,7 @@ Copyright (c) 2022 Samanta Navarro Copyright (c) 2022 Jeffrey Walton Copyright (c) 2022 Jann Horn + Copyright (c) 2023 Sony Corporation / Snild Dolkow Licensed under the MIT license: Permission is hereby granted, free of charge, to any person obtaining @@ -4483,15 +4484,15 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end, parser->m_processor = entityValueProcessor; return entityValueProcessor(parser, next, end, nextPtr); } - /* If we are at the end of the buffer, this would cause XmlPrologTok to - return XML_TOK_NONE on the next call, which would then cause the - function to exit with *nextPtr set to s - that is what we want for other - tokens, but not for the BOM - we would rather like to skip it; - then, when this routine is entered the next time, XmlPrologTok will - return XML_TOK_INVALID, since the BOM is still in the buffer + /* XmlPrologTok has now set the encoding based on the BOM it found, and we + must move s and nextPtr forward to consume the BOM. + + If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we + would leave the BOM in the buffer and return. On the next call to this + function, our XmlPrologTok call would return XML_TOK_INVALID, since it + is not valid to have multiple BOMs. */ - else if (tok == XML_TOK_BOM && next == end - && ! parser->m_parsingStatus.finalBuffer) { + else if (tok == XML_TOK_BOM) { # ifdef XML_DTD if (! accountingDiffTolerated(parser, tok, s, next, __LINE__, XML_ACCOUNT_DIRECT)) { @@ -4501,7 +4502,7 @@ entityValueInitProcessor(XML_Parser parser, const char *s, const char *end, # endif *nextPtr = next; - return XML_ERROR_NONE; + s = next; } /* If we get this token, we have the start of what might be a normal tag, but not a declaration (i.e. it doesn't begin with diff --git a/expat/tests/basic_tests.c b/expat/tests/basic_tests.c index 86045c21..6f7e5a5e 100644 --- a/expat/tests/basic_tests.c +++ b/expat/tests/basic_tests.c @@ -2955,6 +2955,83 @@ START_TEST(test_bad_ignore_section) { } END_TEST +struct bom_testdata { + const char *external; + int split; + XML_Bool nested_callback_happened; +}; + +static int XMLCALL +external_bom_checker(XML_Parser parser, const XML_Char *context, + const XML_Char *base, const XML_Char *systemId, + const XML_Char *publicId) { + const char *text = ""; + UNUSED_P(base); + UNUSED_P(systemId); + UNUSED_P(publicId); + + XML_Parser ext_parser = XML_ExternalEntityParserCreate(parser, context, NULL); + if (ext_parser == NULL) + fail("Could not create external entity parser"); + + if (! xcstrcmp(systemId, XCS("004-2.ent"))) { + struct bom_testdata *const testdata + = (struct bom_testdata *)XML_GetUserData(parser); + const char *const external = testdata->external; + const int split = testdata->split; + testdata->nested_callback_happened = XML_TRUE; + + if (XML_Parse(ext_parser, external, split, XML_FALSE) != XML_STATUS_OK) { + xml_failure(ext_parser); + } + text = external + split; // the parse below will continue where we left off. + } else if (! xcstrcmp(systemId, XCS("004-1.ent"))) { + text = "\n" + "\n" + "\n"; + } else { + fail("unknown systemId"); + } + + if (XML_Parse(ext_parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK) + xml_failure(ext_parser); + + XML_ParserFree(ext_parser); + return XML_STATUS_OK; +} + +/* regression test: BOM should be consumed when followed by a partial token. */ +START_TEST(test_external_bom_consumed) { + const char *const text = "\n" + "\n"; + const char *const external = "\xEF\xBB\xBF"; + const int len = (int)strlen(external); + for (int split = 0; split <= len; ++split) { + set_subtest("split at byte %d", split); + + struct bom_testdata testdata; + testdata.external = external; + testdata.split = split; + testdata.nested_callback_happened = XML_FALSE; + + XML_Parser parser = XML_ParserCreate(NULL); + if (parser == NULL) { + fail("Couldn't create parser"); + } + XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS); + XML_SetExternalEntityRefHandler(parser, external_bom_checker); + XML_SetUserData(parser, &testdata); + if (_XML_Parse_SINGLE_BYTES(parser, text, (int)strlen(text), XML_TRUE) + == XML_STATUS_ERROR) + xml_failure(parser); + if (! testdata.nested_callback_happened) { + fail("ref handler not called"); + } + XML_ParserFree(parser); + } +} +END_TEST + /* Test recursive parsing */ START_TEST(test_external_entity_values) { const char *text = "\n" @@ -5047,6 +5124,7 @@ make_basic_test_case(Suite *s) { tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16); tcase_add_test__ifdef_xml_dtd(tc_basic, test_ignore_section_utf16_be); tcase_add_test__ifdef_xml_dtd(tc_basic, test_bad_ignore_section); + tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_bom_consumed); tcase_add_test__ifdef_xml_dtd(tc_basic, test_external_entity_values); tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_not_standalone); tcase_add_test__ifdef_xml_dtd(tc_basic, test_ext_entity_value_abort);