Bypass partial token heuristic when nearing full buffer

...instead of only when approaching the maximum buffer size, INT_MAX/2+1.

We'd like to give applications a chance to finish parsing a large token
before buffer reallocation, in case the reallocation fails.

By bypassing the reparse deferral heuristic when we get close to filling
the buffer, we give them this chance -- if the whole token is present in
the buffer, it will be parsed at that time.
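
In essence, the check becomes the following (a simplified sketch with
illustrative names; the real condition, in the xmlparse.c hunk below, is
built from m_lastBufferRequestSize, EXPAT_SAFE_PTR_DIFF and
XML_CONTEXT_BYTES):

    #include <stdbool.h>
    #include <stddef.h>

    // Illustrative only: parse now either when the data grew enough since
    // the last attempt (the existing ratio rule), or when another fill of
    // the most recent size would no longer fit in the space that is still
    // usable without growing the buffer.
    static bool should_parse_now(size_t have_now, size_t had_before,
                                 size_t reusable_space, size_t last_fill_size) {
      const bool grew_enough = have_now >= 2 * had_before;
      const bool near_realloc = last_fill_size > reusable_space;
      return grew_enough || near_realloc;
    }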

This may come at the cost of some extra reparse attempts. For a token
of n bytes, these extra parses cause us to scan over a maximum of
2n bytes (... + n/8 + n/4 + n/2 + n). Therefore, parsing of big tokens
remains O(n) in regard to how many bytes we scan in attempts to parse. The
cost in reality is lower than that, since the reparses that happen due
to the bypass will affect m_partialTokenBytesBefore, delaying the next
ratio-based reparse. Furthermore, only the first token that "breaks
through" a buffer ceiling takes that extra reparse attempt; subsequent
large tokens will only bypass the heuristic if they manage to hit the
new buffer ceiling.
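
As a rough illustration of that bound, assume exact doubling and a 1 MiB
token whose growth first bumps into a 256 KiB buffer ceiling:

    256 KiB + 512 KiB + 1024 KiB = 1792 KiB < 2 * 1024 KiB = 2n

so the extra scans triggered by the bypass stay within the 2n figure.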

Note that this cost analysis depends on the assumption that Expat grows
its buffer by doubling it (or, more generally, grows it exponentially).
If this changes, the cost of this bypass may increase. Hopefully, this
would be caught by test_big_tokens_take_linear_time or the new test.

The bypass logic assumes that the application uses a consistent fill size.
If the app increases its fill size, it may miss the bypass (and the
normal heuristic will apply). If the app decreases its fill size, the
bypass may be hit multiple times for the same buffer size. The very
worst case would be to always fill half of the remaining buffer space,
in which case parsing of a large n-byte token becomes O(n log n).
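
Rough numbers to illustrate the fill-size assumption (ignoring
XML_CONTEXT_BYTES): suppose 4 KiB of buffer space is still usable without
growing. With a steady 8 KiB fill, the check "8 KiB > 4 KiB" fires, so the
buffered token gets a parse attempt before the next fill forces a
reallocation. If the app had been filling 2 KiB at a time and only now
switches to 8 KiB, the heuristic only ever saw "2 KiB > 4 KiB", which is
false, so the buffer grows without that extra attempt.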

As an added bonus, the new test case should be faster than the old one,
since it doesn't have to go all the way to 1GiB to check the behavior.

Finally, this change necessitated a small modification to two existing
tests related to reparse deferral. These tests exercise the deferral
enabled setting and assume that reparsing will not happen for any other
reason. By pre-growing the buffer, we make sure that the new bypass
does not affect those test cases.
Snild Dolkow 2023-11-20 16:11:24 +01:00
parent 60b7420989
commit 3d8141d26a
2 changed files with 127 additions and 84 deletions

@@ -996,18 +996,18 @@ callProcessor(XML_Parser parser, const char *start, const char *end,
     // Heuristic: don't try to parse a partial token again until the amount of
     // available data has increased significantly.
     const size_t had_before = parser->m_partialTokenBytesBefore;
-    // ...but *do* try anyway if we're close to reaching the max buffer size.
-    size_t close_to_maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up
+    // ...but *do* try anyway if we're close to causing a reallocation.
+    size_t available_buffer
+        = EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer);
 #if XML_CONTEXT_BYTES > 0
-    // subtract XML_CONTEXT_BYTES, but don't go below zero
-    close_to_maxbuf -= EXPAT_MIN(close_to_maxbuf, XML_CONTEXT_BYTES);
+    available_buffer -= EXPAT_MIN(available_buffer, XML_CONTEXT_BYTES);
 #endif
-    // subtract the last buffer fill size, but don't go below zero
+    available_buffer
+        += EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd);
     // m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok
-    close_to_maxbuf
-        -= EXPAT_MIN(close_to_maxbuf, (size_t)parser->m_lastBufferRequestSize);
     const bool enough
-        = (have_now >= 2 * had_before) || (have_now > close_to_maxbuf);
+        = (have_now >= 2 * had_before)
+          || ((size_t)parser->m_lastBufferRequestSize > available_buffer);

     if (! enough) {
       *endPtr = start; // callers may expect this to be set

@@ -5318,6 +5318,8 @@ START_TEST(test_set_reparse_deferral) {
     XML_Parser parser = XML_ParserCreate(NULL);
     assert_true(parser != NULL);
     assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
+    // pre-grow the buffer to avoid reparsing due to almost-fullness
+    assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);

     CharData storage;
     CharData_Init(&storage);
@@ -5407,6 +5409,8 @@ external_inherited_parser(XML_Parser p, const XML_Char *context,
   XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL);
   assert_true(parser != NULL);
+  // pre-grow the buffer to avoid reparsing due to almost-fullness
+  assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);

   struct element_decl_data testdata;
   testdata.parser = parser;
@@ -5569,92 +5573,131 @@ START_TEST(test_set_bad_reparse_option) {
 }
 END_TEST

-START_TEST(test_bypass_heuristic_when_close_to_maxbuf) {
-  /* There used to be a test here, but one of its dependencies was removed in
-     a rebase. Since it will be replaced by test_..._when_close_to_bufsize() in
-     the next commit, it was not worth fixing.
+static size_t g_totalAlloc = 0;
+static size_t g_biggestAlloc = 0;

-  // this test is slow; avoid running it multiple times for no reason.
+static void *
+counting_realloc(void *ptr, size_t size) {
+  g_totalAlloc += size;
+  if (size > g_biggestAlloc) {
+    g_biggestAlloc = size;
+  }
+  return realloc(ptr, size);
+}
+
+static void *
+counting_malloc(size_t size) {
+  return counting_realloc(NULL, size);
+}
+
+START_TEST(test_bypass_heuristic_when_close_to_bufsize) {
   if (g_chunkSize != 0) {
-    return; // we don't use SINGLE_BYTES
+    // this test does not use SINGLE_BYTES, because it depends on very precise
+    // buffer fills.
+    return;
   }
   if (! g_reparseDeferralEnabledDefault) {
     return; // this test is irrelevant when the deferral heuristic is disabled.
   }
-  const int maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up without overflow
-  const int fillsize = 1024 * 1024;
-  // we expect to be able to fill this many times, and no more.
-  // For example, in the common case of INT_MAX == 2³¹-1:
-  // * maxbuf will be exactly 1 GiB (1024 * 1024 * 1024 bytes)
-  // * that means Expat should be able to handle 1024 fills
-  // * ...but XML_CONTEXT_BYTES can steal some of it from us.
-  const int expected_fills = (maxbuf - XML_CONTEXT_BYTES) / fillsize;
-  // Just to make sure the test isn't completely broken, check that
-  // expected_fills is reasonable for a common setup where int is at
-  // least 32 bits, and XML_CONTEXT_BYTES is no more than 2 MiB.
-  if (sizeof(int) >= 4 && XML_CONTEXT_BYTES <= 2 * fillsize) {
-    assert_true(expected_fills >= 1022);
-  }
+  const int document_length = 65536;
+  char *const document = (char *)malloc(document_length);

-  XML_Parser parser = XML_ParserCreate(NULL);
-  assert_true(parser != NULL);
-  // make the deferral heuristic's threshold grow *extremely* quickly.
-  assert_true(XML_SetReparseDeferralRatio(parser, (float)INT_MAX));
+  const XML_Memory_Handling_Suite memfuncs = {
+      counting_malloc,
+      counting_realloc,
+      free,
+  };

-  // first fill, will push the heuristic threshold beyond the max buffer size
-  {
-    set_subtest("first fill");
-    char *const buf = (char *)XML_GetBuffer(parser, fillsize);
-    assert_true(buf != NULL);
-    memset(buf, 'a', fillsize);
-    buf[0] = '<';
-    if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
-      xml_failure(parser);
-  }
-  // second fill, with data that is not well-formed
-  {
-    set_subtest("second fill");
-    char *const buf = (char *)XML_GetBuffer(parser, fillsize);
-    assert_true(buf != NULL);
-    strcpy(buf, "></wrongend>"); // leaving the rest of the bytes uninitialized
-    // the heuristic should defer parsing, so the error is not reported yet
-    if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
-      xml_failure(parser);
-  }
-  // lots more fills, with uninitialized data (so the test goes fast)
-  // the 3 here is for the hardcoded "first"/"second"/"last" fills.
-  for (int fill = 3; fill < expected_fills; ++fill) {
-    set_subtest("fill #%d", fill);
-    void *const buf = XML_GetBuffer(parser, fillsize);
-    if (buf == NULL) {
-      XML_ParserFree(parser); // avoid leaking our many-MiB parser
-#if defined(_WIN32) && ! defined(_WIN64)
-      // workaround for win[e]32 on GitHub CI not being able to reach 1GiB
-      return;
-#else
-      fail("buffer is NULL");
-#endif
+  const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1};
+  const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1};
+  const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1};
+  for (const int *leading = leading_list; *leading >= 0; leading++) {
+    for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) {
+      for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) {
+        set_subtest("leading=%d bigtoken=%d fillsize=%d", *leading, *bigtoken,
+                    *fillsize);
+        // start by checking that the test looks reasonably valid
+        assert_true(*leading + *bigtoken <= document_length);
+        // put 'x' everywhere; some will be overwritten by elements.
+        memset(document, 'x', document_length);
+        // maybe add an initial tag
+        if (*leading) {
+          assert_true(*leading >= 3); // or the test case is invalid
+          memcpy(document, "<a>", 3);
+        }
+        // add the large token
+        document[*leading + 0] = '<';
+        document[*leading + 1] = 'b';
+        memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token
+        document[*leading + *bigtoken - 1] = '>';
+        // 1 for 'b', plus 1 or 0 depending on the presence of 'a'
+        const int expected_elem_total = 1 + (*leading ? 1 : 0);
+
+        XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL);
+        assert_true(parser != NULL);
+        CharData storage;
+        CharData_Init(&storage);
+        XML_SetUserData(parser, &storage);
+        XML_SetStartElementHandler(parser, start_element_event_handler);
+
+        g_biggestAlloc = 0;
+        g_totalAlloc = 0;
+        int offset = 0;
+        // fill data until the big token is covered (but not necessarily parsed)
+        while (offset < *leading + *bigtoken) {
+          assert_true(offset + *fillsize <= document_length);
+          const enum XML_Status status
+              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
+          if (status != XML_STATUS_OK) {
+            xml_failure(parser);
+          }
+          offset += *fillsize;
+        }
+        // Now, check that we've had a buffer allocation that could fit the
+        // context bytes and our big token. In order to detect a special case,
+        // we need to know how many bytes of our big token were included in the
+        // first push that contained _any_ bytes of the big token:
+        const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize);
+        if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) {
+          // Special case: we aren't saving any context, and the whole big token
+          // was covered by a single fill, so Expat may have parsed directly
+          // from our input pointer, without allocating an internal buffer.
+        } else if (*leading < XML_CONTEXT_BYTES) {
+          assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken);
+        } else {
+          assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken);
+        }
+        // fill data until the big token is actually parsed
+        while (storage.count < expected_elem_total) {
+          const size_t alloc_before = g_totalAlloc;
+          assert_true(offset + *fillsize <= document_length);
+          const enum XML_Status status
+              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
+          if (status != XML_STATUS_OK) {
+            xml_failure(parser);
+          }
+          offset += *fillsize;
+          // since all the bytes of the big token are already in the buffer,
+          // the bufsize ceiling should make us finish its parsing without any
+          // further buffer allocations. We assume that there will be no other
+          // large allocations in this test.
+          assert_true(g_totalAlloc - alloc_before < 4096);
+        }
+        // test-the-test: was our alloc even called?
+        assert_true(g_totalAlloc > 0);
+        // test-the-test: there shouldn't be any extra start elements
+        assert_true(storage.count == expected_elem_total);
+        XML_ParserFree(parser);
+      }
     }
-    // the heuristic should defer parsing, so the error is not reported yet
-    if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
-      xml_failure(parser);
   }
-  // last fill, should actually parse and detect the error
-  {
-    set_subtest("last fill");
-    void *const buf = XML_GetBuffer(parser, fillsize);
-    assert_true(buf != NULL);
-    // Using isFinal=XML_FALSE here is important: we want to check that the
-    // heuristic correctly detects the "close to out-of-memory" situation and
-    // actually parses the pending data. XML_TRUE would force the heuristic to
-    // parse regardless, which is not what we want.
-    if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_ERROR)
-      fail("expected parse error");
-    assert_true(XML_GetErrorCode(parser) == XML_ERROR_TAG_MISMATCH);
-  }
-  XML_ParserFree(parser);
-  */
+  free(document);
 }
 END_TEST
@@ -5904,5 +5947,5 @@ make_basic_test_case(Suite *s) {
   tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
   tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
   tcase_add_test(tc_basic, test_set_bad_reparse_option);
-  tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_maxbuf);
+  tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize);
 }