Merge pull request #789 from SonyMobile/partial-token-perf

Speed up parsing of big tokens
This commit is contained in:
Sebastian Pipping 2024-01-30 22:54:37 +01:00 committed by GitHub
commit 34b598c5f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 883 additions and 94 deletions

View file

@ -53,6 +53,7 @@ XML_SetNotationDeclHandler
XML_SetNotStandaloneHandler
XML_SetParamEntityParsing
XML_SetProcessingInstructionHandler
XML_SetReparseDeferralEnabled
XML_SetReturnNSTriplet
XML_SetSkippedEntityHandler
XML_SetStartCdataSectionHandler

View file

@ -131,6 +131,11 @@ buildlib:
# Builds the benchmark tool in tests/benchmark, then runs it once per sample
# document under testdata/largefiles (via ./run.sh).
# NOTE(review): the two trailing numbers are presumably a buffer size and a
# repeat count -- confirm against the benchmark tool's usage text.
run-benchmark:
$(MAKE) -C tests/benchmark
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/recset.xml 65535 3
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_attr.xml 4096 3
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_cdata.xml 4096 3
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_comment.xml 4096 3
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_tag.xml 4096 3
./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_text.xml 4096 3
.PHONY: download-xmlts-zip
download-xmlts-zip:

View file

@ -152,10 +152,11 @@ interface.</p>
</ul>
</li>
<li>
<a href="#billion-laughs">Billion Laughs Attack Protection</a>
<a href="#attack-protection">Attack Protection</a>
<ul>
<li><a href="#XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</a></li>
<li><a href="#XML_SetBillionLaughsAttackProtectionActivationThreshold">XML_SetBillionLaughsAttackProtectionActivationThreshold</a></li>
<li><a href="#XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</a></li>
</ul>
</li>
<li><a href="#miscellaneous">Miscellaneous Functions</a>
@ -2167,11 +2168,7 @@ parse position may be before the beginning of the buffer.</p>
return <code>NULL</code>.</p>
</div>
<h3><a name="billion-laughs">Billion Laughs Attack Protection</a></h3>
<p>The functions in this section configure the built-in
protection against various forms of
<a href="https://en.wikipedia.org/wiki/Billion_laughs_attack">billion laughs attacks</a>.</p>
<h3><a name="attack-protection">Attack Protection</a><a name="billion-laughs"></a></h3>
<h4 id="XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</h4>
<pre class="fcndec">
@ -2259,6 +2256,27 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(XML_Parser p,
</p>
</div>
<h4 id="XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</h4>
<pre class="fcndec">
/* Added in Expat 2.6.0. */
XML_Bool XMLCALL
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
</pre>
<div class="fcndef">
<p>
A large token may require many parse calls before enough data is available for Expat to parse it in full.
If Expat retried parsing the token on every parse call, parsing could take quadratic time.
To avoid this, Expat only retries once a significant amount of new data is available.
This function allows disabling this behavior.
</p>
<p>
The <code>enabled</code> argument should be <code>XML_TRUE</code> or <code>XML_FALSE</code>.
</p>
<p>
Returns <code>XML_TRUE</code> on success, and <code>XML_FALSE</code> on error.
</p>
</div>
<h3><a name="miscellaneous">Miscellaneous Functions</a></h3>
<p>The functions in this section either obtain state information from

View file

@ -342,6 +342,16 @@ supports both.
</listitem>
</varlistentry>
<varlistentry>
<term><option>-q</option></term>
<listitem>
<para>
Disable reparse deferral, and allow quadratic parse runtime
on large tokens (default: reparse deferral enabled).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-r</option></term>
<listitem>

View file

@ -16,6 +16,7 @@
Copyright (c) 2016 Thomas Beutlich <tc@tbeu.de>
Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
Copyright (c) 2022 Thijs Schreijer <thijs@thijsschreijer.nl>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -1054,6 +1055,10 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
XML_Parser parser, unsigned long long activationThresholdBytes);
#endif
/* Added in Expat 2.6.0. */
XMLPARSEAPI(XML_Bool)
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
/* Expat follows the semantic versioning convention.
See https://semver.org
*/

View file

@ -31,6 +31,7 @@
Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
Copyright (c) 2018 Yury Gribov <tetra2005@gmail.com>
Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
Copyright (c) 2023 Sony Corporation / Snild Dolkow <snild@sony.com>
Licensed under the MIT license:
Permission is hereby granted, free of charge, to any person obtaining
@ -160,6 +161,9 @@ unsigned long long testingAccountingGetCountBytesIndirect(XML_Parser parser);
const char *unsignedCharToPrintable(unsigned char c);
#endif
extern XML_Bool g_reparseDeferralEnabledDefault; // written ONLY in runtests.c
extern unsigned int g_parseAttempts; // used for testing only
#ifdef __cplusplus
}
#endif

View file

@ -77,3 +77,5 @@ EXPORTS
; added with version 2.4.0
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionActivationThreshold @69
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionMaximumAmplification @70
; added with version 2.6.0
XML_SetReparseDeferralEnabled @71

View file

@ -89,6 +89,7 @@
# endif
#endif
#include <stdbool.h>
#include <stddef.h>
#include <string.h> /* memset(), memcpy() */
#include <assert.h>
@ -212,6 +213,8 @@ typedef char ICHAR;
/* Do safe (NULL-aware) pointer arithmetic */
#define EXPAT_SAFE_PTR_DIFF(p, q) (((p) && (q)) ? ((p) - (q)) : 0)
#define EXPAT_MIN(a, b) (((a) < (b)) ? (a) : (b))
#include "internal.h"
#include "xmltok.h"
#include "xmlrole.h"
@ -624,6 +627,9 @@ static unsigned long getDebugLevel(const char *variableName,
? 0 \
: ((*((pool)->ptr)++ = c), 1))
// Default value that parserInit() copies into m_reparseDeferralEnabled;
// written ONLY in runtests.c
XML_Bool g_reparseDeferralEnabledDefault = XML_TRUE;
// Incremented by callProcessor() each time it actually invokes the processor;
// used for testing only
unsigned int g_parseAttempts = 0;
struct XML_ParserStruct {
/* The first member must be m_userData so that the XML_GetUserData
macro works. */
@ -647,6 +653,9 @@ struct XML_ParserStruct {
XML_Index m_parseEndByteIndex;
const char *m_parseEndPtr;
size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
XML_Bool m_reparseDeferralEnabled;
int m_lastBufferRequestSize;
XML_Char *m_dataBuf;
XML_Char *m_dataBufEnd;
XML_StartElementHandler m_startElementHandler;
@ -978,6 +987,47 @@ get_hash_secret_salt(XML_Parser parser) {
return parser->m_hash_secret_salt;
}
/* Runs the parser's current processor over [start, end), unless reparse
   deferral decides that retrying now is not worthwhile.

   When deferral is enabled and this is not the final buffer, the processor
   is skipped (and XML_ERROR_NONE returned with *endPtr == start) unless
   either
     - at least twice as many bytes are on offer as on the last attempt that
       consumed nothing, or
     - the last requested buffer fill size no longer fits in the buffer space
       that is usable without a reallocation.

   After a successful processor call, m_partialTokenBytesBefore records how
   many bytes were on offer if nothing was consumed, and is reset to 0
   otherwise. */
static enum XML_Error
callProcessor(XML_Parser parser, const char *start, const char *end,
              const char **endPtr) {
  const size_t bytesOffered = EXPAT_SAFE_PTR_DIFF(end, start);
  const bool mayDefer = parser->m_reparseDeferralEnabled
                        && ! parser->m_parsingStatus.finalBuffer;

  if (mayDefer) {
    // Space in front of the unparsed data that could be reused...
    size_t reusableFront
        = EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer);
#if XML_CONTEXT_BYTES > 0
    // ...minus what has to be kept around as context.
    reusableFront -= EXPAT_MIN(reusableFront, XML_CONTEXT_BYTES);
#endif
    // Space behind the data that is already buffered.
    const size_t freeTail
        = EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd);
    const size_t roomWithoutRealloc = reusableFront + freeTail;

    // Heuristic: only retry a partial token once the amount of available
    // data has grown significantly...
    const bool grewEnough
        = bytesOffered >= 2 * parser->m_partialTokenBytesBefore;
    // ...but retry anyway when the next fill would be close to forcing a
    // reallocation. m_lastBufferRequestSize is never assigned a negative
    // value, so the cast is fine.
    const bool nextFillWouldNotFit
        = (size_t)parser->m_lastBufferRequestSize > roomWithoutRealloc;

    if (! grewEnough && ! nextFillWouldNotFit) {
      *endPtr = start; // callers may expect this to be set
      return XML_ERROR_NONE;
    }
  }

  g_parseAttempts += 1;
  const enum XML_Error result
      = parser->m_processor(parser, start, end, endPtr);
  if (result != XML_ERROR_NONE) {
    return result;
  }

  // Nothing consumed: remember how much we had on this attempt.
  parser->m_partialTokenBytesBefore = (*endPtr == start) ? bytesOffered : 0;
  return result;
}
static XML_Bool /* only valid for root parser */
startParsing(XML_Parser parser) {
/* hash functions must be initialized before setContext() is called */
@ -1159,6 +1209,9 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
parser->m_bufferEnd = parser->m_buffer;
parser->m_parseEndByteIndex = 0;
parser->m_parseEndPtr = NULL;
parser->m_partialTokenBytesBefore = 0;
parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault;
parser->m_lastBufferRequestSize = 0;
parser->m_declElementType = NULL;
parser->m_declAttributeId = NULL;
parser->m_declEntity = NULL;
@ -1328,6 +1381,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
to worry which hash secrets each table has.
*/
unsigned long oldhash_secret_salt;
XML_Bool oldReparseDeferralEnabled;
/* Validate the oldParser parameter before we pull everything out of it */
if (oldParser == NULL)
@ -1372,6 +1426,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
to worry which hash secrets each table has.
*/
oldhash_secret_salt = parser->m_hash_secret_salt;
oldReparseDeferralEnabled = parser->m_reparseDeferralEnabled;
#ifdef XML_DTD
if (! context)
@ -1424,6 +1479,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
parser->m_defaultExpandInternalEntities = oldDefaultExpandInternalEntities;
parser->m_ns_triplets = oldns_triplets;
parser->m_hash_secret_salt = oldhash_secret_salt;
parser->m_reparseDeferralEnabled = oldReparseDeferralEnabled;
parser->m_parentParser = oldParser;
#ifdef XML_DTD
parser->m_paramEntityParsing = oldParamEntityParsing;
@ -1878,55 +1934,8 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
parser->m_parsingStatus.parsing = XML_PARSING;
}
if (len == 0) {
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
if (! isFinal)
return XML_STATUS_OK;
parser->m_positionPtr = parser->m_bufferPtr;
parser->m_parseEndPtr = parser->m_bufferEnd;
/* If data are left over from last buffer, and we now know that these
data are the final chunk of input, then we have to check them again
to detect errors based on that fact.
*/
parser->m_errorCode
= parser->m_processor(parser, parser->m_bufferPtr,
parser->m_parseEndPtr, &parser->m_bufferPtr);
if (parser->m_errorCode == XML_ERROR_NONE) {
switch (parser->m_parsingStatus.parsing) {
case XML_SUSPENDED:
/* It is hard to be certain, but it seems that this case
* cannot occur. This code is cleaning up a previous parse
* with no new data (since len == 0). Changing the parsing
* state requires getting to execute a handler function, and
* there doesn't seem to be an opportunity for that while in
* this circumstance.
*
* Given the uncertainty, we retain the code but exclude it
* from coverage tests.
*
* LCOV_EXCL_START
*/
XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr,
parser->m_bufferPtr, &parser->m_position);
parser->m_positionPtr = parser->m_bufferPtr;
return XML_STATUS_SUSPENDED;
/* LCOV_EXCL_STOP */
case XML_INITIALIZED:
case XML_PARSING:
parser->m_parsingStatus.parsing = XML_FINISHED;
/* fall through */
default:
return XML_STATUS_OK;
}
}
parser->m_eventEndPtr = parser->m_eventPtr;
parser->m_processor = errorProcessor;
return XML_STATUS_ERROR;
}
#if XML_CONTEXT_BYTES == 0
else if (parser->m_bufferPtr == parser->m_bufferEnd) {
if (parser->m_bufferPtr == parser->m_bufferEnd) {
const char *end;
int nLeftOver;
enum XML_Status result;
@ -1937,12 +1946,15 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
parser->m_processor = errorProcessor;
return XML_STATUS_ERROR;
}
// though this isn't a buffer request, we assume that `len` is the app's
// preferred buffer fill size, and therefore save it here.
parser->m_lastBufferRequestSize = len;
parser->m_parseEndByteIndex += len;
parser->m_positionPtr = s;
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
parser->m_errorCode
= parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end);
= callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end);
if (parser->m_errorCode != XML_ERROR_NONE) {
parser->m_eventEndPtr = parser->m_eventPtr;
@ -1975,6 +1987,9 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
parser->m_parsingStatus.parsing = XML_PARSING;
void *const temp = XML_GetBuffer(parser, nLeftOver);
parser->m_parsingStatus.parsing = originalStatus;
// GetBuffer may have overwritten this, but we want to remember what the
// app requested, not how many bytes were left over after parsing.
parser->m_lastBufferRequestSize = len;
if (temp == NULL) {
// NOTE: parser->m_errorCode has already been set by XML_GetBuffer().
parser->m_eventPtr = parser->m_eventEndPtr = NULL;
@ -1996,15 +2011,14 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
return result;
}
#endif /* XML_CONTEXT_BYTES == 0 */
else {
void *buff = XML_GetBuffer(parser, len);
if (buff == NULL)
return XML_STATUS_ERROR;
else {
memcpy(buff, s, len);
return XML_ParseBuffer(parser, len, isFinal);
}
void *buff = XML_GetBuffer(parser, len);
if (buff == NULL)
return XML_STATUS_ERROR;
if (len > 0) {
assert(s != NULL); // make sure s==NULL && len!=0 was rejected above
memcpy(buff, s, len);
}
return XML_ParseBuffer(parser, len, isFinal);
}
enum XML_Status XMLCALL
@ -2044,8 +2058,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
parser->m_parseEndByteIndex += len;
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
parser->m_errorCode = parser->m_processor(
parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr,
&parser->m_bufferPtr);
if (parser->m_errorCode != XML_ERROR_NONE) {
parser->m_eventEndPtr = parser->m_eventPtr;
@ -2090,7 +2104,11 @@ XML_GetBuffer(XML_Parser parser, int len) {
default:;
}
if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)) {
// whether or not the request succeeds, `len` seems to be the app's preferred
// buffer fill size; remember it.
parser->m_lastBufferRequestSize = len;
if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)
|| parser->m_buffer == NULL) {
#if XML_CONTEXT_BYTES > 0
int keep;
#endif /* XML_CONTEXT_BYTES > 0 */
@ -2113,8 +2131,9 @@ XML_GetBuffer(XML_Parser parser, int len) {
}
neededSize += keep;
#endif /* XML_CONTEXT_BYTES > 0 */
if (neededSize
<= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
if (parser->m_buffer && parser->m_bufferPtr
&& neededSize
<= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
#if XML_CONTEXT_BYTES > 0
if (keep < EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer)) {
int offset
@ -2128,14 +2147,12 @@ XML_GetBuffer(XML_Parser parser, int len) {
parser->m_bufferPtr -= offset;
}
#else
if (parser->m_buffer && parser->m_bufferPtr) {
memmove(parser->m_buffer, parser->m_bufferPtr,
EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
parser->m_bufferEnd
= parser->m_buffer
+ EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
parser->m_bufferPtr = parser->m_buffer;
}
memmove(parser->m_buffer, parser->m_bufferPtr,
EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
parser->m_bufferEnd
= parser->m_buffer
+ EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
parser->m_bufferPtr = parser->m_buffer;
#endif /* XML_CONTEXT_BYTES > 0 */
} else {
char *newBuf;
@ -2237,7 +2254,7 @@ XML_ResumeParser(XML_Parser parser) {
}
parser->m_parsingStatus.parsing = XML_PARSING;
parser->m_errorCode = parser->m_processor(
parser->m_errorCode = callProcessor(
parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
if (parser->m_errorCode != XML_ERROR_NONE) {
@ -2592,6 +2609,15 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
}
#endif /* XML_GE == 1 */
/* Switches reparse deferral on or off for the given parser.

   parser:  the parser to configure; NULL is rejected.
   enabled: must be exactly XML_TRUE or XML_FALSE; anything else is rejected.

   Returns XML_TRUE when the setting was stored, XML_FALSE otherwise (in
   which case the parser is left untouched). */
XML_Bool XMLCALL
XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) {
  if (parser == NULL) {
    return XML_FALSE;
  }
  if (enabled != XML_TRUE && enabled != XML_FALSE) {
    return XML_FALSE;
  }
  parser->m_reparseDeferralEnabled = enabled;
  return XML_TRUE;
}
/* Initially tag->rawName always points into the parse buffer;
for those TAG instances opened while the current parse buffer was
processed, and not yet closed, we need to store tag->rawName in a more

View file

@ -49,6 +49,7 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#if ! defined(__cplusplus)
# include <stdbool.h>
@ -2910,6 +2911,20 @@ START_TEST(test_buffer_can_grow_to_max) {
}
END_TEST
START_TEST(test_getbuffer_allocates_on_zero_len) {
for (int first_len = 1; first_len >= 0; first_len--) {
set_subtest("with len=%d first", first_len);
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
assert_true(XML_GetBuffer(parser, first_len) != NULL);
assert_true(XML_GetBuffer(parser, 0) != NULL);
if (XML_ParseBuffer(parser, 0, XML_FALSE) != XML_STATUS_OK)
xml_failure(parser);
XML_ParserFree(parser);
}
}
END_TEST
/* Test position information macros */
START_TEST(test_byte_info_at_end) {
const char *text = "<doc></doc>";
@ -3148,7 +3163,7 @@ static int XMLCALL
external_bom_checker(XML_Parser parser, const XML_Char *context,
const XML_Char *base, const XML_Char *systemId,
const XML_Char *publicId) {
const char *text = "";
const char *text;
UNUSED_P(base);
UNUSED_P(systemId);
UNUSED_P(publicId);
@ -3625,7 +3640,9 @@ START_TEST(test_suspend_resume_internal_entity) {
XML_SetStartElementHandler(g_parser, start_element_suspender);
XML_SetCharacterDataHandler(g_parser, accumulate_characters);
XML_SetUserData(g_parser, &storage);
if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
// can't use SINGLE_BYTES here, because it'll return early on suspension, and
// we won't know exactly how much input we actually managed to give Expat.
if (XML_Parse(g_parser, text, (int)strlen(text), XML_TRUE)
!= XML_STATUS_SUSPENDED)
xml_failure(g_parser);
CharData_CheckXMLChars(&storage, XCS(""));
@ -4638,6 +4655,12 @@ START_TEST(test_utf8_in_start_tags) {
char doc[1024];
size_t failCount = 0;
// we need all the bytes to be parsed, but we don't want the errors that can
// trigger on isFinal=XML_TRUE, so we skip the test if the heuristic is on.
if (g_reparseDeferralEnabledDefault) {
return;
}
for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
size_t j = 0;
for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
@ -5178,6 +5201,629 @@ START_TEST(test_nested_entity_suspend) {
}
END_TEST
/* Regression test for quadratic parsing on large tokens.
 *
 * Each case feeds a parser text[i].pre, then fillcount blocks of fillsize
 * 'a' bytes, then text[i].post (with isFinal set), timing the whole run with
 * clock(). Case 0 provides the baseline; every later case fails as soon as
 * its elapsed time divided by the baseline (integer division) reaches
 * max_slowdown.
 *
 * NOTE(review): all input goes through _XML_Parse_SINGLE_BYTES, which
 * presumably feeds the data in small pieces -- confirm against its
 * definition.
 */
START_TEST(test_big_tokens_take_linear_time) {
const char *const too_slow_failure_message
= "Compared to the baseline runtime of the first test, this test has a "
"slowdown of more than <max_slowdown>. "
"Please keep increasing the value by 1 until it reliably passes the "
"test on your hardware and open a bug sharing that number with us. "
"Thanks in advance!";
const struct {
const char *pre;
const char *post;
} text[] = {
{"<a>", "</a>"}, // assumed good, used as baseline
{"<b><![CDATA[ value: ", " ]]></b>"}, // CDATA, performed OK before patch
{"<c attr='", "'></c>"}, // big attribute, used to be O(N²)
{"<d><!-- ", " --></d>"}, // long comment, used to be O(N²)
{"<e><", "/></e>"}, // big elem name, used to be O(N²)
};
const int num_cases = sizeof(text) / sizeof(text[0]);
// For the test we need a <max_slowdown> value that is:
// (1) big enough that the test passes reliably (avoiding flaky tests), and
// (2) small enough that the test actually catches regressions.
const int max_slowdown = 15;
// One block of filler; it is pushed fillcount times per case.
char aaaaaa[4096];
const int fillsize = (int)sizeof(aaaaaa);
const int fillcount = 100;
memset(aaaaaa, 'a', fillsize);
if (! g_reparseDeferralEnabledDefault) {
return; // heuristic is disabled; we would get O(n^2) and fail.
}
#if defined(_WIN32)
if (CLOCKS_PER_SEC < 100000) {
// Skip this test if clock() doesn't have reasonably good resolution.
// This workaround is only applied to Windows targets, since XSI requires
// the value to be 1 000 000 (10x the condition here), and we want to be
// very sure that at least one platform in CI can catch regressions.
return;
}
#endif
// Total clock() ticks taken by case 0; set at the end of the first iteration.
clock_t baseline = 0;
for (int i = 0; i < num_cases; ++i) {
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
enum XML_Status status;
set_subtest("max_slowdown=%d text=\"%saaaaaa%s\"", max_slowdown,
text[i].pre, text[i].post);
const clock_t start = clock();
// parse the start text
status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre,
(int)strlen(text[i].pre), XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
// parse lots of 'a', failing the test early if it takes too long
for (int f = 0; f < fillcount; ++f) {
status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
// i == 0 means we're still calculating the baseline value
if (i > 0) {
const clock_t now = clock();
const clock_t clocks_so_far = now - start;
// baseline is non-zero here: it was asserted > 0 when case 0 finished
const int slowdown = clocks_so_far / baseline;
if (slowdown >= max_slowdown) {
fprintf(
stderr,
"fill#%d: clocks_so_far=%d baseline=%d slowdown=%d max_slowdown=%d\n",
f, (int)clocks_so_far, (int)baseline, slowdown, max_slowdown);
fail(too_slow_failure_message);
}
}
}
// parse the end text
status = _XML_Parse_SINGLE_BYTES(parser, text[i].post,
(int)strlen(text[i].post), XML_TRUE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
// how long did it take in total?
const clock_t end = clock();
const clock_t taken = end - start;
if (i == 0) {
assert_true(taken > 0); // just to make sure we don't div-by-0 later
baseline = taken;
}
// for case 0 this is taken / taken == 1, so the check below cannot trigger
const int slowdown = taken / baseline;
if (slowdown >= max_slowdown) {
fprintf(stderr, "taken=%d baseline=%d slowdown=%d max_slowdown=%d\n",
(int)taken, (int)baseline, slowdown, max_slowdown);
fail(too_slow_failure_message);
}
XML_ParserFree(parser);
}
}
END_TEST
/* Checks that XML_SetReparseDeferralEnabled controls when the start-element
   handler fires for a big <x attr='...'> start tag that arrives in many small
   pieces: with deferral off the tag is reported as soon as it is complete,
   with deferral on only after enough further data has been pushed. */
START_TEST(test_set_reparse_deferral) {
  const char *const prologue = "<d>";
  const char *const token_head = "<x attr='";
  const char *const token_tail = "'></x>";
  char filler[100];
  const int fillsize = (int)sizeof(filler);
  memset(filler, 'e', fillsize);

  int enabled = 0;
  while (enabled <= 1) {
    set_subtest("deferral=%d", enabled);

    XML_Parser parser = XML_ParserCreate(NULL);
    assert_true(parser != NULL);
    assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
    // pre-grow the buffer to avoid reparsing due to almost-fullness
    assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);

    CharData storage;
    CharData_Init(&storage);
    XML_SetUserData(parser, &storage);
    XML_SetStartElementHandler(parser, start_element_event_handler);

    // the complete first element
    if (XML_Parse(parser, prologue, (int)strlen(prologue), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }
    CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done

    // the opening part of the big token
    if (XML_Parse(parser, token_head, (int)strlen(token_head), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }
    CharData_CheckXMLChars(&storage, XCS("d")); // still just the first one

    // lots of attribute value; the token stays unfinished
    for (int round = 0; round < 100; ++round) {
      if (XML_Parse(parser, filler, fillsize, XML_FALSE) != XML_STATUS_OK) {
        xml_failure(parser);
      }
    }
    CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one

    // close the <x> token
    if (XML_Parse(parser, token_tail, (int)strlen(token_tail), XML_FALSE)
        != XML_STATUS_OK) {
      xml_failure(parser);
    }

    if (enabled) {
      // In general, more data may or may not be needed to trigger a reparse
      // attempt; this test's data is constructed so that it always is.
      CharData_CheckXMLChars(&storage, XCS("d")); // or the test is incorrect

      // 2x the token length should suffice; the +1 covers the start and end.
      for (int round = 0; round < 101; ++round) {
        if (XML_Parse(parser, filler, fillsize, XML_FALSE) != XML_STATUS_OK) {
          xml_failure(parser);
        }
      }
    }
    CharData_CheckXMLChars(&storage, XCS("dx")); // the <x> should be done

    XML_ParserFree(parser);
    enabled += 1;
  }
}
END_TEST
struct element_decl_data {
XML_Parser parser;
int count;
};
static void
element_decl_counter(void *userData, const XML_Char *name, XML_Content *model) {
UNUSED_P(name);
struct element_decl_data *testdata = (struct element_decl_data *)userData;
testdata->count += 1;
XML_FreeContentModel(testdata->parser, model);
}
static int
external_inherited_parser(XML_Parser p, const XML_Char *context,
const XML_Char *base, const XML_Char *systemId,
const XML_Char *publicId) {
UNUSED_P(base);
UNUSED_P(systemId);
UNUSED_P(publicId);
const char *const pre = "<!ELEMENT document ANY>\n";
const char *const start = "<!ELEMENT ";
const char *const end = " ANY>\n";
const char *const post = "<!ELEMENT xyz ANY>\n";
const int enabled = *(int *)XML_GetUserData(p);
char eeeeee[100];
char spaces[100];
const int fillsize = (int)sizeof(eeeeee);
assert_true(fillsize == (int)sizeof(spaces));
memset(eeeeee, 'e', fillsize);
memset(spaces, ' ', fillsize);
XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL);
assert_true(parser != NULL);
// pre-grow the buffer to avoid reparsing due to almost-fullness
assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
struct element_decl_data testdata;
testdata.parser = parser;
testdata.count = 0;
XML_SetUserData(parser, &testdata);
XML_SetElementDeclHandler(parser, element_decl_counter);
enum XML_Status status;
// parse the initial text
status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
assert_true(testdata.count == 1); // first element should be done
// ..and the start of the big token
status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
assert_true(testdata.count == 1); // still just the first one
// try to parse lots of 'e', but the token isn't finished
for (int c = 0; c < 100; ++c) {
status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
}
assert_true(testdata.count == 1); // *still* just the first one
// end the big token.
status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
if (enabled) {
// In general, we may need to push more data to trigger a reparse attempt,
// but in this test, the data is constructed to always require it.
assert_true(testdata.count == 1); // or the test is incorrect
// 2x the token length should suffice; the +1 covers the start and end.
for (int c = 0; c < 101; ++c) {
status = XML_Parse(parser, spaces, fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
}
}
assert_true(testdata.count == 2); // the big token should be done
// parse the final text
status = XML_Parse(parser, post, (int)strlen(post), XML_TRUE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
assert_true(testdata.count == 3); // after isFinal=XML_TRUE, all must be done
XML_ParserFree(parser);
return XML_STATUS_OK;
}
START_TEST(test_reparse_deferral_is_inherited) {
const char *const text
= "<!DOCTYPE document SYSTEM 'something.ext'><document/>";
for (int enabled = 0; enabled <= 1; ++enabled) {
set_subtest("deferral=%d", enabled);
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
XML_SetUserData(parser, (void *)&enabled);
XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
// this handler creates a sub-parser and checks that its deferral behavior
// is what we expected, based on the value of `enabled` (in userdata).
XML_SetExternalEntityRefHandler(parser, external_inherited_parser);
assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
if (XML_Parse(parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
xml_failure(parser);
XML_ParserFree(parser);
}
}
END_TEST
/* XML_SetReparseDeferralEnabled must return XML_FALSE for a NULL parser,
   whatever value is passed for `enabled` (valid or not). */
START_TEST(test_set_reparse_deferral_on_null_parser) {
assert_true(XML_SetReparseDeferralEnabled(NULL, 0) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 1) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 10) == XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, 100) == XML_FALSE);
// extreme int values, narrowed to XML_Bool by the explicit casts
assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MIN)
== XML_FALSE);
assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MAX)
== XML_FALSE);
}
END_TEST
/* Checks that reparse deferral can be turned off in the middle of a parse:
   a completed but still deferred <x> start tag must be reported by the next
   (empty, non-final) XML_Parse call once deferral has been disabled. */
START_TEST(test_set_reparse_deferral_on_the_fly) {
  const char *const opening = "<d><x attr='";
  const char *const closing = "'></x>";
  char filler[100];
  const int fillsize = (int)sizeof(filler);
  memset(filler, 'i', fillsize);

  XML_Parser parser = XML_ParserCreate(NULL);
  assert_true(parser != NULL);
  assert_true(XML_SetReparseDeferralEnabled(parser, XML_TRUE));

  CharData storage;
  CharData_Init(&storage);
  XML_SetUserData(parser, &storage);
  XML_SetStartElementHandler(parser, start_element_event_handler);

  // <d> plus the beginning of the <x> token
  if (XML_Parse(parser, opening, (int)strlen(opening), XML_FALSE)
      != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done

  // some attribute value; the token isn't finished
  if (XML_Parse(parser, filler, fillsize, XML_FALSE) != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one

  // end the <x> token.
  if (XML_Parse(parser, closing, (int)strlen(closing), XML_FALSE)
      != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&storage, XCS("d")); // not yet.

  // now change the heuristic setting and add *no* data
  assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE));
  // we avoid isFinal=XML_TRUE, because that would force-bypass the heuristic.
  if (XML_Parse(parser, "", 0, XML_FALSE) != XML_STATUS_OK) {
    xml_failure(parser);
  }
  CharData_CheckXMLChars(&storage, XCS("dx"));

  XML_ParserFree(parser);
}
END_TEST
/* XML_SetReparseDeferralEnabled must reject every `enabled` value other than
   0 and 1 on a valid parser, and accept exactly those two. */
START_TEST(test_set_bad_reparse_option) {
  XML_Parser parser = XML_ParserCreate(NULL);
  // Without this check a failed parser creation would make the rejection
  // assertions below pass for the wrong reason (NULL parser) and only show up
  // as a confusing failure at the XML_TRUE assertions.
  assert_true(parser != NULL);

  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 2));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 3));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 99));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 127));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 128));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 129));
  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 255));

  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0));
  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1));

  XML_ParserFree(parser);
}
END_TEST
/* Allocation statistics gathered by counting_malloc / counting_realloc. */
static size_t g_totalAlloc = 0;   /* sum of all requested sizes */
static size_t g_biggestAlloc = 0; /* largest single requested size */

/* realloc() wrapper that records the requested size in g_totalAlloc and
   g_biggestAlloc before delegating; the size is counted whether or not the
   underlying realloc() succeeds. */
static void *
counting_realloc(void *ptr, size_t size) {
  g_totalAlloc += size;
  g_biggestAlloc = (size > g_biggestAlloc) ? size : g_biggestAlloc;
  return realloc(ptr, size);
}

/* malloc() replacement with the same bookkeeping; implemented as a
   counting_realloc() of a NULL pointer. */
static void *
counting_malloc(size_t size) {
  void *const fresh = counting_realloc(NULL, size);
  return fresh;
}
/* Checks the interaction between reparse deferral and the parser's input
   buffer: once every byte of a big token has been pushed, finishing the
   parse of that token must not require any further large allocation.

   The test builds a 64 KiB document consisting of an optional leading "<a>"
   tag, filler 'x' characters, and one big "<b   ...   >" tag, then feeds it
   in fixed-size fills for every combination of the three parameter lists
   below. Allocations are observed through the counting memory suite
   (g_totalAlloc / g_biggestAlloc). */
START_TEST(test_bypass_heuristic_when_close_to_bufsize) {
  if (g_chunkSize != 0) {
    // this test does not use SINGLE_BYTES, because it depends on very precise
    // buffer fills.
    return;
  }
  if (! g_reparseDeferralEnabledDefault) {
    return; // this test is irrelevant when the deferral heuristic is disabled.
  }

  const int document_length = 65536;
  char *const document = (char *)malloc(document_length);
  // fail cleanly on allocation failure, rather than passing NULL to the
  // memset()/memcpy() calls below
  assert_true(document != NULL);

  // route all parser allocations through the counting wrappers
  const XML_Memory_Handling_Suite memfuncs = {
      counting_malloc,
      counting_realloc,
      free,
  };

  // each list is terminated by a negative sentinel
  const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1};
  const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1};
  const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1};

  for (const int *leading = leading_list; *leading >= 0; leading++) {
    for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) {
      for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) {
        set_subtest("leading=%d bigtoken=%d fillsize=%d", *leading, *bigtoken,
                    *fillsize);
        // start by checking that the test looks reasonably valid
        assert_true(*leading + *bigtoken <= document_length);

        // put 'x' everywhere; some will be overwritten by elements.
        memset(document, 'x', document_length);
        // maybe add an initial tag
        if (*leading) {
          assert_true(*leading >= 3); // or the test case is invalid
          memcpy(document, "<a>", 3);
        }
        // add the large token
        document[*leading + 0] = '<';
        document[*leading + 1] = 'b';
        memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token
        document[*leading + *bigtoken - 1] = '>';

        // 1 for 'b', plus 1 or 0 depending on the presence of 'a'
        const int expected_elem_total = 1 + (*leading ? 1 : 0);

        XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL);
        assert_true(parser != NULL);

        CharData storage;
        CharData_Init(&storage);
        XML_SetUserData(parser, &storage);
        XML_SetStartElementHandler(parser, start_element_event_handler);

        // only count allocations made while feeding data, not those made
        // while creating the parser
        g_biggestAlloc = 0;
        g_totalAlloc = 0;
        int offset = 0;
        // fill data until the big token is covered (but not necessarily parsed)
        while (offset < *leading + *bigtoken) {
          assert_true(offset + *fillsize <= document_length);
          const enum XML_Status status
              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
          if (status != XML_STATUS_OK) {
            xml_failure(parser);
          }
          offset += *fillsize;
        }
        // Now, check that we've had a buffer allocation that could fit the
        // context bytes and our big token. In order to detect a special case,
        // we need to know how many bytes of our big token were included in the
        // first push that contained _any_ bytes of the big token:
        const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize);
        if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) {
          // Special case: we aren't saving any context, and the whole big token
          // was covered by a single fill, so Expat may have parsed directly
          // from our input pointer, without allocating an internal buffer.
        } else if (*leading < XML_CONTEXT_BYTES) {
          assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken);
        } else {
          assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken);
        }

        // fill data until the big token is actually parsed
        while (storage.count < expected_elem_total) {
          const size_t alloc_before = g_totalAlloc;
          assert_true(offset + *fillsize <= document_length);
          const enum XML_Status status
              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
          if (status != XML_STATUS_OK) {
            xml_failure(parser);
          }
          offset += *fillsize;
          // since all the bytes of the big token are already in the buffer,
          // the bufsize ceiling should make us finish its parsing without any
          // further buffer allocations. We assume that there will be no other
          // large allocations in this test.
          assert_true(g_totalAlloc - alloc_before < 4096);
        }
        // test-the-test: was our alloc even called?
        assert_true(g_totalAlloc > 0);
        // test-the-test: there shouldn't be any extra start elements
        assert_true(storage.count == expected_elem_total);

        XML_ParserFree(parser);
      }
    }
  }
  free(document);
}
END_TEST
// Feeds a single very large start tag ("<t", then spaces, then ">", 7654321
// bytes in total) to the parser using many different sequences of fill sizes.
// For each sequence it adds up the number of bytes that were pending at every
// XML_Parse call that resulted in a parse attempt, and checks that sum against
// a per-sequence limit (deferral enabled) or against the worst case where
// every call reparses everything (deferral disabled).
START_TEST(test_varying_buffer_fills) {
const int KiB = 1024;
const int MiB = 1024 * KiB;
const int document_length = 16 * MiB;
const int big = 7654321; // arbitrarily chosen between 4 and 8 MiB
if (g_chunkSize != 0) {
return; // this test is slow, and doesn't use _XML_Parse_SINGLE_BYTES().
}
char *const document = (char *)malloc(document_length);
assert_true(document != NULL);
// The document is all 'x' filler, except for the big token at its start.
memset(document, 'x', document_length);
document[0] = '<';
document[1] = 't';
memset(&document[2], ' ', big - 2); // a very spacy token
document[big - 1] = '>';
// Each testcase is a list of buffer fill sizes, terminated by a value < 0.
// When reparse deferral is enabled, the final (negated) value is the expected
// maximum number of bytes scanned in parse attempts.
// Rows shorter than 30 entries are zero-padded by the initializer; the feed
// loop below stops at the first negative entry and never reads the padding.
const int testcases[][30] = {
{8 * MiB, -8 * MiB},
{4 * MiB, 4 * MiB, -12 * MiB}, // try at 4MB, then 8MB = 12 MB total
// zero-size fills shouldn't trigger the bypass
{4 * MiB, 0, 4 * MiB, -12 * MiB},
{4 * MiB, 0, 0, 4 * MiB, -12 * MiB},
{4 * MiB, 0, 1 * MiB, 0, 3 * MiB, -12 * MiB},
// try to hit the buffer ceiling only once (at the end)
{4 * MiB, 2 * MiB, 1 * MiB, 512 * KiB, 256 * KiB, 256 * KiB, -12 * MiB},
// try to hit the same buffer ceiling multiple times
{4 * MiB + 1, 2 * MiB, 1 * MiB, 512 * KiB, -25 * MiB},
// try to hit every ceiling, by always landing 1K shy of the buffer size
{1 * KiB, 2 * KiB, 4 * KiB, 8 * KiB, 16 * KiB, 32 * KiB, 64 * KiB,
128 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, -16 * MiB},
// try to avoid every ceiling, by always landing 1B past the buffer size
// the normal 2x heuristic threshold still forces parse attempts.
{2 * KiB + 1, // will attempt 2KiB + 1 ==> total 2KiB + 1
2 * KiB, 4 * KiB, // will attempt 8KiB + 1 ==> total 10KiB + 2
8 * KiB, 16 * KiB, // will attempt 32KiB + 1 ==> total 42KiB + 3
32 * KiB, 64 * KiB, // will attempt 128KiB + 1 ==> total 170KiB + 4
128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
512 * KiB, 1 * MiB, // will attempt 2MiB + 1 ==> total 2M + 682K + 6
2 * MiB, 4 * MiB, // will attempt 8MiB + 1 ==> total 10M + 682K + 7
-(10 * MiB + 682 * KiB + 7)},
// try to avoid every ceiling again, except on our last fill.
{2 * KiB + 1, // will attempt 2KiB + 1 ==> total 2KiB + 1
2 * KiB, 4 * KiB, // will attempt 8KiB + 1 ==> total 10KiB + 2
8 * KiB, 16 * KiB, // will attempt 32KiB + 1 ==> total 42KiB + 3
32 * KiB, 64 * KiB, // will attempt 128KiB + 1 ==> total 170KiB + 4
128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
512 * KiB, 1 * MiB, // will attempt 2MiB + 1 ==> total 2M + 682K + 6
2 * MiB, 4 * MiB - 1, // will attempt 8MiB ==> total 10M + 682K + 6
-(10 * MiB + 682 * KiB + 6)},
// try to hit ceilings on the way multiple times
{512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 1 MiB buffer
512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 2 MiB buffer
1 * MiB + 1, 512 * KiB, 256 * KiB, 256 * KiB - 1, // 4 MiB buffer
2 * MiB + 1, 1 * MiB, 512 * KiB, // 8 MiB buffer
// we'll make a parse attempt at every parse call
-(45 * MiB + 12)},
};
const int testcount = sizeof(testcases) / sizeof(testcases[0]);
for (int test_i = 0; test_i < testcount; test_i++) {
// fillsize walks along the current row; it ends up on the negative terminator
const int *fillsize = testcases[test_i];
// The label shows the first four row entries; for short rows that includes
// the negative terminator and/or zero padding.
set_subtest("#%d {%d %d %d %d ...}", test_i, fillsize[0], fillsize[1],
fillsize[2], fillsize[3]);
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
// NOTE(review): g_parseAttempts is defined outside this block; presumably it
// is incremented once per actual parse attempt -- confirm at its definition.
g_parseAttempts = 0;
CharData storage;
CharData_Init(&storage);
XML_SetUserData(parser, &storage);
XML_SetStartElementHandler(parser, start_element_event_handler);
int worstcase_bytes = 0; // sum of (buffered bytes at each XML_Parse call)
int scanned_bytes = 0; // sum of (buffered bytes at each actual parse)
int offset = 0;
// Feed the document one fill at a time until the row's terminator is reached.
while (*fillsize >= 0) {
assert_true(offset + *fillsize <= document_length); // or test is invalid
const unsigned attempts_before = g_parseAttempts;
const enum XML_Status status
= XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
if (status != XML_STATUS_OK) {
xml_failure(parser);
}
offset += *fillsize;
fillsize++;
assert_true(offset <= INT_MAX - worstcase_bytes); // avoid overflow
worstcase_bytes += offset; // we might've tried to parse all pending bytes
// Only count this call's bytes as scanned if the attempt counter moved.
if (g_parseAttempts != attempts_before) {
assert_true(g_parseAttempts == attempts_before + 1); // max 1/XML_Parse
assert_true(offset <= INT_MAX - scanned_bytes); // avoid overflow
scanned_bytes += offset; // we *did* try to parse all pending bytes
}
}
assert_true(storage.count == 1); // the big token should've been parsed
assert_true(scanned_bytes > 0); // test-the-test: does our counter work?
if (g_reparseDeferralEnabledDefault) {
// heuristic is enabled; some XML_Parse calls may have deferred reparsing
// fillsize now points at the row's negative terminator; negating it yields
// the limit for this row.
const int max_bytes_scanned = -*fillsize;
if (scanned_bytes > max_bytes_scanned) {
// print both numbers before failing, so the overshoot is visible
fprintf(stderr,
"bytes scanned in parse attempts: actual=%d limit=%d \n",
scanned_bytes, max_bytes_scanned);
fail("too many bytes scanned in parse attempts");
}
assert_true(scanned_bytes <= worstcase_bytes);
} else {
// heuristic is disabled; every XML_Parse() will have reparsed
assert_true(scanned_bytes == worstcase_bytes);
}
XML_ParserFree(parser);
}
free(document);
}
END_TEST
void
make_basic_test_case(Suite *s) {
TCase *tc_basic = tcase_create("basic tests");
@ -5299,6 +5945,7 @@ make_basic_test_case(Suite *s) {
tcase_add_test(tc_basic, test_get_buffer_3_overflow);
#endif
tcase_add_test(tc_basic, test_buffer_can_grow_to_max);
tcase_add_test(tc_basic, test_getbuffer_allocates_on_zero_len);
tcase_add_test(tc_basic, test_byte_info_at_end);
tcase_add_test(tc_basic, test_byte_info_at_error);
tcase_add_test(tc_basic, test_byte_info_at_cdata);
@ -5417,4 +6064,12 @@ make_basic_test_case(Suite *s) {
tcase_add_test__ifdef_xml_dtd(tc_basic,
test_pool_integrity_with_unfinished_attr);
tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend);
tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
tcase_add_test(tc_basic, test_set_reparse_deferral);
tcase_add_test(tc_basic, test_reparse_deferral_is_inherited);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
tcase_add_test(tc_basic, test_set_bad_reparse_option);
tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize);
tcase_add_test(tc_basic, test_varying_buffer_fills);
}

View file

@ -185,7 +185,7 @@ _xml_failure(XML_Parser parser, const char *file, int line) {
"u, offset %" XML_FMT_INT_MOD "u)\n reported from %s, line %d\n",
err, XML_ErrorString(err), XML_GetCurrentLineNumber(parser),
XML_GetCurrentColumnNumber(parser), file, line);
_assert_true(0, file, line, buffer);
_fail(file, line, buffer);
}
enum XML_Status
@ -214,9 +214,9 @@ _expect_failure(const char *text, enum XML_Error errorCode,
const char *errorMessage, const char *file, int lineno) {
if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
== XML_STATUS_OK)
/* Hackish use of _assert_true() macro, but let's us report
/* Hackish use of _fail() macro, but lets us report
the right filename and line number. */
_assert_true(0, file, lineno, errorMessage);
_fail(file, lineno, errorMessage);
if (XML_GetErrorCode(g_parser) != errorCode)
_xml_failure(g_parser, file, lineno);
}

View file

@ -1717,7 +1717,9 @@ record_element_end_handler(void *userData, const XML_Char *name) {
const struct handler_record_entry *
_handler_record_get(const struct handler_record_list *storage, int index,
const char *file, int line) {
_assert_true(storage->count > index, file, line, "too few handler calls");
if (storage->count <= index) {
_fail(file, line, "too few handler calls");
}
return &storage->entries[index];
}

View file

@ -244,14 +244,11 @@ srunner_summarize(SRunner *runner, int verbosity) {
}
void
_assert_true(int condition, const char *file, int line, const char *msg) {
_fail(const char *file, int line, const char *msg) {
/* Always print the error message so it isn't lost. In this case,
we have a failure, so there's no reason to be quiet about what
it is.
*/
if (condition) {
return;
}
_check_current_filename = file;
_check_current_lineno = line;
if (msg != NULL) {

View file

@ -83,9 +83,13 @@ extern "C" {
void PRINTF_LIKE(1, 2) set_subtest(char const *fmt, ...);
# define fail(msg) _assert_true(0, __FILE__, __LINE__, msg)
# define fail(msg) _fail(__FILE__, __LINE__, msg)
# define assert_true(cond) \
_assert_true((cond), __FILE__, __LINE__, "check failed: " #cond)
do { \
if (! (cond)) { \
_fail(__FILE__, __LINE__, "check failed: " #cond); \
} \
} while (0)
typedef void (*tcase_setup_function)(void);
typedef void (*tcase_teardown_function)(void);
@ -124,7 +128,11 @@ void _check_set_test_info(char const *function, char const *filename,
* Prototypes for the actual implementation.
*/
void _assert_true(int condition, const char *file, int line, const char *msg);
# if defined(__GNUC__)
__attribute__((noreturn))
# endif
void
_fail(const char *file, int line, const char *msg);
Suite *suite_create(const char *name);
TCase *tcase_create(const char *name);
void suite_add_tcase(Suite *suite, TCase *tc);

View file

@ -98,10 +98,14 @@ main(int argc, char *argv[]) {
printf("Expat version: %" XML_FMT_STR "\n", XML_ExpatVersion());
for (g_chunkSize = 0; g_chunkSize <= 5; g_chunkSize++) {
char context[100];
snprintf(context, sizeof(context), "chunksize=%d", g_chunkSize);
context[sizeof(context) - 1] = '\0';
srunner_run_all(sr, context, verbosity);
for (int enabled = 0; enabled <= 1; ++enabled) {
char context[100];
g_reparseDeferralEnabledDefault = enabled;
snprintf(context, sizeof(context), "chunksize=%d deferral=%d",
g_chunkSize, enabled);
context[sizeof(context) - 1] = '\0';
srunner_run_all(sr, context, verbosity);
}
}
srunner_summarize(sr, verbosity);
nf = srunner_ntests_failed(sr);

View file

@ -918,6 +918,9 @@ usage(const XML_Char *prog, int rc) {
T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n")
T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n")
T("\n")
T("reparse deferral:\n")
T(" -q disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n")
T("\n")
T("info arguments:\n")
T(" -h, --help show this [h]elp message and exit\n")
T(" -v, --version show program's [v]ersion number and exit\n")
@ -973,6 +976,8 @@ tmain(int argc, XML_Char **argv) {
unsigned long long attackThresholdBytes = 0;
XML_Bool attackThresholdGiven = XML_FALSE;
XML_Bool disableDeferral = XML_FALSE;
int exitCode = XMLWF_EXIT_SUCCESS;
enum XML_ParamEntityParsing paramEntityParsing
= XML_PARAM_ENTITY_PARSING_NEVER;
@ -1125,6 +1130,11 @@ tmain(int argc, XML_Char **argv) {
#endif
break;
}
case T('q'): {
disableDeferral = XML_TRUE;
j++;
break;
}
case T('\0'):
if (j > 1) {
i++;
@ -1171,6 +1181,16 @@ tmain(int argc, XML_Char **argv) {
#endif
}
if (disableDeferral) {
const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE);
if (! success) {
// This prevents tperror(..) from reporting misleading "[..]: Success"
errno = EINVAL;
tperror(T("Failed to disable reparse deferral"));
exit(XMLWF_EXIT_INTERNAL_ERROR);
}
}
if (requireStandalone)
XML_SetNotStandaloneHandler(parser, notStandalone);
XML_SetParamEntityParsing(parser, paramEntityParsing);

View file

@ -82,6 +82,10 @@ billion_laughs.add_argument('-a', metavar='FACTOR',
help='set maximum tolerated [a]mplification factor (default: 100.0)')
billion_laughs.add_argument('-b', metavar='BYTES', help='set number of output [b]ytes needed to activate (default: 8 MiB)')
reparse_deferral = parser.add_argument_group('reparse deferral')
# -q is a plain on/off switch that takes no value, so it is registered with
# store_true and without a metavar; otherwise argparse would demand an
# argument and render the option as "-q FACTOR" in the generated help.
reparse_deferral.add_argument('-q', action='store_true',
                              help='disable reparse deferral, and allow [q]uadratic parse runtime with large tokens')
parser.add_argument('files', metavar='FILE', nargs='*', help='file to process (default: STDIN)')
info = parser.add_argument_group('info arguments')

View file

@ -31,4 +31,27 @@ resulting measurements tell us.)
utility, specifically for testing the duplicate attribute check in
storeAttributes()
* aaaaaa_attr.xml (~10 MB):
- properties: trivial file with a huge attribute value
- source: generated by a simple shell script
- purpose: performance/regression test
* aaaaaa_cdata.xml (~10 MB):
- properties: trivial file with huge cdata content
- source: generated by a simple shell script
- purpose: performance/regression test
* aaaaaa_comment.xml (~10 MB):
- properties: trivial file with a huge comment
- source: generated by a simple shell script
- purpose: performance/regression test
* aaaaaa_tag.xml (~10 MB):
- properties: trivial file with a huge tag name
- source: generated by a simple shell script
- purpose: performance/regression test
* aaaaaa_text.xml (~10 MB):
- properties: trivial file with a huge text segment (no newlines)
- source: generated by a simple shell script
- purpose: performance/regression test

1
testdata/largefiles/aaaaaa_attr.xml vendored Normal file

File diff suppressed because one or more lines are too long

1
testdata/largefiles/aaaaaa_cdata.xml vendored Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1
testdata/largefiles/aaaaaa_tag.xml vendored Normal file

File diff suppressed because one or more lines are too long

1
testdata/largefiles/aaaaaa_text.xml vendored Normal file

File diff suppressed because one or more lines are too long