Add app setting for enabling/disabling reparse heuristic

Suggested-by: Sebastian Pipping <sebastian@pipping.org>
CI-fighting-assistance-by: Sebastian Pipping <sebastian@pipping.org>
This commit is contained in:
Snild Dolkow 2023-09-11 15:31:24 +02:00
parent 09957b8ced
commit 1d3162da8a
6 changed files with 196 additions and 7 deletions

View file

@ -53,6 +53,7 @@ XML_SetNotationDeclHandler
XML_SetNotStandaloneHandler
XML_SetParamEntityParsing
XML_SetProcessingInstructionHandler
XML_SetReparseDeferralEnabled
XML_SetReturnNSTriplet
XML_SetSkippedEntityHandler
XML_SetStartCdataSectionHandler

View file

@ -152,10 +152,11 @@ interface.</p>
</ul>
</li>
<li>
<a href="#billion-laughs">Billion Laughs Attack Protection</a>
<a href="#attack-protection">Attack Protection</a>
<ul>
<li><a href="#XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</a></li>
<li><a href="#XML_SetBillionLaughsAttackProtectionActivationThreshold">XML_SetBillionLaughsAttackProtectionActivationThreshold</a></li>
<li><a href="#XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</a></li>
</ul>
</li>
<li><a href="#miscellaneous">Miscellaneous Functions</a>
@ -2167,11 +2168,7 @@ parse position may be before the beginning of the buffer.</p>
return <code>NULL</code>.</p>
</div>
<h3><a name="billion-laughs">Billion Laughs Attack Protection</a></h3>
<p>The functions in this section configure the built-in
protection against various forms of
<a href="https://en.wikipedia.org/wiki/Billion_laughs_attack">billion laughs attacks</a>.</p>
<h3><a name="attack-protection">Attack Protection</a><a name="billion-laughs"></a></h3>
<h4 id="XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</h4>
<pre class="fcndec">
@ -2259,6 +2256,27 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(XML_Parser p,
</p>
</div>
<h4 id="XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</h4>
<pre class="fcndec">
/* Added in Expat 2.6.0. */
XML_Bool XMLCALL
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
</pre>
<div class="fcndef">
<p>
A large token may require many parse calls before enough data is available for Expat to parse it in full.
If Expat retried parsing the token on every parse call, parsing could take quadratic time.
To avoid this, Expat only retries once a significant amount of new data is available.
This function allows disabling (or re-enabling) this behavior for the given parser.
</p>
<p>
The <code>enabled</code> argument should be <code>XML_TRUE</code> or <code>XML_FALSE</code>.
</p>
<p>
Returns <code>XML_TRUE</code> on success, and <code>XML_FALSE</code> on error.
</p>
</div>
<h3><a name="miscellaneous">Miscellaneous functions</a></h3>
<p>The functions in this section either obtain state information from

View file

@ -16,6 +16,7 @@
Copyright (c) 2016 Thomas Beutlich <tc@tbeu.de>
Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
Copyright (c) 2022 Thijs Schreijer <thijs@thijsschreijer.nl>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -1054,6 +1055,10 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
XML_Parser parser, unsigned long long activationThresholdBytes);
#endif
/* Added in Expat 2.6.0. */
/* Enables (XML_TRUE) or disables (XML_FALSE) reparse deferral for the given
   parser.  Returns XML_TRUE on success; returns XML_FALSE, changing nothing,
   if parser is NULL or enabled is neither XML_TRUE nor XML_FALSE. */
XMLPARSEAPI(XML_Bool)
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
/* Expat follows the semantic versioning convention.
See https://semver.org
*/

View file

@ -77,3 +77,5 @@ EXPORTS
; added with version 2.4.0
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionActivationThreshold @69
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionMaximumAmplification @70
; added with version 2.6.0
XML_SetReparseDeferralEnabled @71

View file

@ -651,6 +651,7 @@ struct XML_ParserStruct {
XML_Index m_parseEndByteIndex;
const char *m_parseEndPtr;
size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
XML_Bool m_reparseDeferralEnabled;
XML_Char *m_dataBuf;
XML_Char *m_dataBufEnd;
XML_StartElementHandler m_startElementHandler;
@ -987,7 +988,7 @@ callProcessor(XML_Parser parser, const char *start, const char *end,
const char **endPtr) {
const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start);
if (g_reparseDeferralEnabledDefault
if (parser->m_reparseDeferralEnabled
&& ! parser->m_parsingStatus.finalBuffer) {
// Heuristic: don't try to parse a partial token again until the amount of
// available data has increased significantly.
@ -1193,6 +1194,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
parser->m_parseEndByteIndex = 0;
parser->m_parseEndPtr = NULL;
parser->m_partialTokenBytesBefore = 0;
parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault;
parser->m_declElementType = NULL;
parser->m_declAttributeId = NULL;
parser->m_declEntity = NULL;
@ -2617,6 +2619,15 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
}
#endif /* XML_GE == 1 */
/* Switches the reparse deferral heuristic on or off for a single parser.

   parser   the parser to configure; NULL is rejected.
   enabled  must be exactly XML_TRUE or XML_FALSE; anything else is rejected.

   Returns XML_TRUE after storing the setting, or XML_FALSE (leaving the
   parser untouched) when either argument is rejected. */
XML_Bool XMLCALL
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) {
  if (parser == NULL) {
    return XML_FALSE;
  }
  if (enabled != XML_TRUE && enabled != XML_FALSE) {
    return XML_FALSE;
  }
  parser->m_reparseDeferralEnabled = enabled;
  return XML_TRUE;
}
/* Initially tag->rawName always points into the parse buffer;
for those TAG instances opened while the current parse buffer was
processed, and not yet closed, we need to store tag->rawName in a more

View file

@ -5304,6 +5304,154 @@ START_TEST(test_big_tokens_take_linear_time) {
}
END_TEST
/* Feeds an <x> start tag with a very long attribute value to the parser in
   many small pieces, once with reparse deferral disabled (0) and once with it
   enabled (1), and checks when the start-element handler reports "x". */
START_TEST(test_set_reparse_deferral) {
  const char *const doc_open = "<d>";
  const char *const token_head = "<x attr='";
  const char *const token_tail = "'></x>";
  char filler[100];
  memset(filler, 'e', sizeof(filler));

  for (int deferral = 0; deferral < 2; ++deferral) {
    set_subtest("deferral=%d", deferral);

    XML_Parser parser = XML_ParserCreate(NULL);
    assert_true(parser != NULL);
    assert_true(XML_SetReparseDeferralEnabled(parser, deferral));

    CharData seen;
    CharData_Init(&seen);
    XML_SetUserData(parser, &seen);
    XML_SetStartElementHandler(parser, start_element_event_handler);

    // The opening <d> is a complete token and must be reported right away.
    if (XML_Parse(parser, doc_open, (int)strlen(doc_open), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }
    CharData_CheckXMLChars(&seen, XCS("d"));

    // Begin the <x> tag; it is incomplete, so nothing new may be reported.
    if (XML_Parse(parser, token_head, (int)strlen(token_head), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }
    CharData_CheckXMLChars(&seen, XCS("d"));

    // Grow the attribute value chunk by chunk; the tag stays unfinished.
    for (int round = 0; round < 100; ++round) {
      if (XML_Parse(parser, filler, (int)sizeof(filler), XML_FALSE)
          != XML_STATUS_OK) {
        xml_failure(parser);
      }
    }
    CharData_CheckXMLChars(&seen, XCS("d"));

    // Supply the closing quote and the rest of the element.
    if (XML_Parse(parser, token_tail, (int)strlen(token_tail), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }

    if (deferral != 0) {
      // With deferral on, a reparse attempt may need more data in general;
      // this input is built so that it always does. If <x> already showed
      // up here, the test's assumptions are wrong.
      CharData_CheckXMLChars(&seen, XCS("d"));
      // Twice the token length should be enough; the extra round accounts
      // for the head and tail pieces.
      for (int round = 0; round < 101; ++round) {
        if (XML_Parse(parser, filler, (int)sizeof(filler), XML_FALSE)
            != XML_STATUS_OK) {
          xml_failure(parser);
        }
      }
    }

    // In both modes the <x> start tag must have been delivered by now.
    CharData_CheckXMLChars(&seen, XCS("dx"));

    XML_ParserFree(parser);
  }
}
END_TEST
/* A NULL parser must make XML_SetReparseDeferralEnabled return XML_FALSE
   no matter what is passed for `enabled` -- including 0 and 1, which are
   accepted for a real parser. */
START_TEST(test_set_reparse_deferral_on_null_parser) {
assert_true(XML_SetReparseDeferralEnabled(NULL, 0) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 1) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 10) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 100) == XML_FALSE);
/* Extreme int values, explicitly converted to XML_Bool.
   NOTE(review): the resulting value depends on XML_Bool's underlying type,
   which is not visible here -- confirm against expat.h. */
assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MIN)
== XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MAX)
== XML_FALSE);
}
END_TEST
/* Starts parsing a long, unfinished <x> tag with reparse deferral enabled,
   then disables deferral mid-stream and checks that the very next (tiny,
   non-final) parse call delivers the pending start tag. */
START_TEST(test_set_reparse_deferral_on_the_fly) {
  const char *const head = "<d><x attr='";
  const char *const tail = "'></x";
  const char *const closer = ">";
  char filler[100];
  memset(filler, 'i', sizeof(filler));

  XML_Parser parser = XML_ParserCreate(NULL);
  assert_true(parser != NULL);
  assert_true(XML_SetReparseDeferralEnabled(parser, XML_TRUE));

  CharData seen;
  CharData_Init(&seen);
  XML_SetUserData(parser, &seen);
  XML_SetStartElementHandler(parser, start_element_event_handler);

  // <d> is complete and must be reported; <x ...> has only just begun.
  if (XML_Parse(parser, head, (int)strlen(head), XML_FALSE) != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&seen, XCS("d"));

  // Extend the attribute value; the <x> tag is still unfinished.
  if (XML_Parse(parser, filler, (int)sizeof(filler), XML_FALSE)
      != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&seen, XCS("d"));

  // Complete the <x> start tag. With deferral still on, it is expected not
  // to have been reported yet.
  if (XML_Parse(parser, tail, (int)strlen(tail), XML_FALSE) != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&seen, XCS("d"));

  // Flip the setting; the next call adds no data belonging to <x>.
  assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE));
  // isFinal=XML_TRUE is avoided on purpose: it would bypass the heuristic
  // by force and so would not exercise the setting change.
  if (XML_Parse(parser, closer, (int)strlen(closer), XML_FALSE)
      != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&seen, XCS("dx"));

  XML_ParserFree(parser);
}
END_TEST
/* XML_SetReparseDeferralEnabled must reject every `enabled` value other than
   0 (XML_FALSE) and 1 (XML_TRUE) on a valid parser, and accept those two. */
START_TEST(test_set_bad_reparse_option) {
  XML_Parser parser = XML_ParserCreate(NULL);
  // Without this check a failed allocation would let the "rejected" cases
  // below pass for the wrong reason, since a NULL parser is rejected too.
  assert_true(parser != NULL);

  // Values outside {0, 1} must be refused.
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 2));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 3));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 99));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 127));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 128));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 129));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 255));

  // The two legal values must be accepted.
  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0));
  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1));

  XML_ParserFree(parser);
}
END_TEST
void
make_basic_test_case(Suite *s) {
TCase *tc_basic = tcase_create("basic tests");
@ -5545,4 +5693,8 @@ make_basic_test_case(Suite *s) {
test_pool_integrity_with_unfinished_attr);
tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend);
tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
tcase_add_test(tc_basic, test_set_reparse_deferral);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
tcase_add_test(tc_basic, test_set_bad_reparse_option);
}