mirror of
https://github.com/libexpat/libexpat.git
synced 2025-04-06 13:45:00 +00:00
Bypass partial token heuristic when nearing full buffer
...instead of only when approaching the maximum buffer size INT_MAX/2+1. We'd like to give applications a chance to finish parsing a large token before buffer reallocation, in case the reallocation fails. By bypassing the reparse deferral heuristic when getting close to filling the buffer, we give them this chance -- if the whole token is present in the buffer, it will be parsed at that time. This may come at the cost of some extra reparse attempts. For a token of n bytes, these extra parses cause us to scan over a maximum of 2n bytes (... + n/8 + n/4 + n/2 + n). Therefore, parsing of big tokens remains O(n) in regard to how many bytes we scan in attempts to parse. The cost in reality is lower than that, since the reparses that happen due to the bypass will affect m_partialTokenBytesBefore, delaying the next ratio-based reparse. Furthermore, only the first token that "breaks through" a buffer ceiling takes that extra reparse attempt; subsequent large tokens will only bypass the heuristic if they manage to hit the new buffer ceiling. Note that this cost analysis depends on the assumption that Expat grows its buffer by doubling it (or, more generally, grows it exponentially). If this changes, the cost of this bypass may increase. Hopefully, this would be caught by test_big_tokens_take_linear_time or the new test. The bypass logic assumes that the application uses a consistent fill size. If the app increases its fill size, it may miss the bypass (and the normal heuristic will apply). If the app decreases its fill size, the bypass may be hit multiple times for the same buffer size. The very worst case would be to always fill half of the remaining buffer space, in which case parsing of a large n-byte token becomes O(n log n). As an added bonus, the new test case should be faster than the old one, since it doesn't have to go all the way to 1GiB to check the behavior. Finally, this change necessitated a small modification to two existing tests related to reparse deferral. 
These tests are testing the deferral enabled setting, and assume that reparsing will not happen for any other reason. By pre-growing the buffer, we make sure that this new deferral does not affect those test cases.
This commit is contained in:
parent
60b7420989
commit
3d8141d26a
2 changed files with 127 additions and 84 deletions
|
@ -996,18 +996,18 @@ callProcessor(XML_Parser parser, const char *start, const char *end,
|
|||
// Heuristic: don't try to parse a partial token again until the amount of
|
||||
// available data has increased significantly.
|
||||
const size_t had_before = parser->m_partialTokenBytesBefore;
|
||||
// ...but *do* try anyway if we're close to reaching the max buffer size.
|
||||
size_t close_to_maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up
|
||||
// ...but *do* try anyway if we're close to causing a reallocation.
|
||||
size_t available_buffer
|
||||
= EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer);
|
||||
#if XML_CONTEXT_BYTES > 0
|
||||
// subtract XML_CONTEXT_BYTES, but don't go below zero
|
||||
close_to_maxbuf -= EXPAT_MIN(close_to_maxbuf, XML_CONTEXT_BYTES);
|
||||
available_buffer -= EXPAT_MIN(available_buffer, XML_CONTEXT_BYTES);
|
||||
#endif
|
||||
// subtract the last buffer fill size, but don't go below zero
|
||||
available_buffer
|
||||
+= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd);
|
||||
// m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok
|
||||
close_to_maxbuf
|
||||
-= EXPAT_MIN(close_to_maxbuf, (size_t)parser->m_lastBufferRequestSize);
|
||||
const bool enough
|
||||
= (have_now >= 2 * had_before) || (have_now > close_to_maxbuf);
|
||||
= (have_now >= 2 * had_before)
|
||||
|| ((size_t)parser->m_lastBufferRequestSize > available_buffer);
|
||||
|
||||
if (! enough) {
|
||||
*endPtr = start; // callers may expect this to be set
|
||||
|
|
|
@ -5318,6 +5318,8 @@ START_TEST(test_set_reparse_deferral) {
|
|||
XML_Parser parser = XML_ParserCreate(NULL);
|
||||
assert_true(parser != NULL);
|
||||
assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
|
||||
// pre-grow the buffer to avoid reparsing due to almost-fullness
|
||||
assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
|
||||
|
||||
CharData storage;
|
||||
CharData_Init(&storage);
|
||||
|
@ -5407,6 +5409,8 @@ external_inherited_parser(XML_Parser p, const XML_Char *context,
|
|||
|
||||
XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL);
|
||||
assert_true(parser != NULL);
|
||||
// pre-grow the buffer to avoid reparsing due to almost-fullness
|
||||
assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
|
||||
|
||||
struct element_decl_data testdata;
|
||||
testdata.parser = parser;
|
||||
|
@ -5569,92 +5573,131 @@ START_TEST(test_set_bad_reparse_option) {
|
|||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_bypass_heuristic_when_close_to_maxbuf) {
|
||||
/* There used to be a test here, but one of its dependencies was removed in
|
||||
a rebase. Since it will be replaced by test_..._when_close_to_bufsize() in
|
||||
the next commit, it was not worth fixing.
|
||||
static size_t g_totalAlloc = 0;
|
||||
static size_t g_biggestAlloc = 0;
|
||||
|
||||
// this test is slow; avoid running it multiple times for no reason.
|
||||
static void *
|
||||
counting_realloc(void *ptr, size_t size) {
|
||||
g_totalAlloc += size;
|
||||
if (size > g_biggestAlloc) {
|
||||
g_biggestAlloc = size;
|
||||
}
|
||||
return realloc(ptr, size);
|
||||
}
|
||||
|
||||
static void *
|
||||
counting_malloc(size_t size) {
|
||||
return counting_realloc(NULL, size);
|
||||
}
|
||||
|
||||
START_TEST(test_bypass_heuristic_when_close_to_bufsize) {
|
||||
if (g_chunkSize != 0) {
|
||||
return; // we don't use SINGLE_BYTES
|
||||
// this test does not use SINGLE_BYTES, because it depends on very precise
|
||||
// buffer fills.
|
||||
return;
|
||||
}
|
||||
if (! g_reparseDeferralEnabledDefault) {
|
||||
return; // this test is irrelevant when the deferral heuristic is disabled.
|
||||
}
|
||||
|
||||
const int maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up without overflow
|
||||
const int fillsize = 1024 * 1024;
|
||||
// we expect to be able to fill this many times, and no more.
|
||||
// For example, in the common case of INT_MAX == 2³¹-1:
|
||||
// * maxbuf will be exactly 1 GiB (1024 * 1024 * 1024 bytes)
|
||||
// * that means Expat should be able to handle 1024 fills
|
||||
// * ...but XML_CONTEXT_BYTES can steal some of it from us.
|
||||
const int expected_fills = (maxbuf - XML_CONTEXT_BYTES) / fillsize;
|
||||
// Just to make sure the test isn't completely broken, check that
|
||||
// expected_fills is reasonable for a common setup where int is at
|
||||
// least 32 bits, and XML_CONTEXT_BYTES is no more than 2 MiB.
|
||||
if (sizeof(int) >= 4 && XML_CONTEXT_BYTES <= 2 * fillsize) {
|
||||
assert_true(expected_fills >= 1022);
|
||||
}
|
||||
const int document_length = 65536;
|
||||
char *const document = (char *)malloc(document_length);
|
||||
|
||||
XML_Parser parser = XML_ParserCreate(NULL);
|
||||
assert_true(parser != NULL);
|
||||
// make the deferral heuristic's threshold grow *extremely* quickly.
|
||||
assert_true(XML_SetReparseDeferralRatio(parser, (float)INT_MAX));
|
||||
const XML_Memory_Handling_Suite memfuncs = {
|
||||
counting_malloc,
|
||||
counting_realloc,
|
||||
free,
|
||||
};
|
||||
|
||||
// first fill, will push the heuristic threshold beyond the max buffer size
|
||||
{
|
||||
set_subtest("first fill");
|
||||
char *const buf = (char *)XML_GetBuffer(parser, fillsize);
|
||||
assert_true(buf != NULL);
|
||||
memset(buf, 'a', fillsize);
|
||||
buf[0] = '<';
|
||||
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
|
||||
xml_failure(parser);
|
||||
}
|
||||
// second fill, with data that is not well-formed
|
||||
{
|
||||
set_subtest("second fill");
|
||||
char *const buf = (char *)XML_GetBuffer(parser, fillsize);
|
||||
assert_true(buf != NULL);
|
||||
strcpy(buf, "></wrongend>"); // leaving the rest of the bytes uninitialized
|
||||
// the heuristic should defer parsing, so the error is not reported yet
|
||||
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
|
||||
xml_failure(parser);
|
||||
}
|
||||
// lots more fills, with uninitialized data (so the test goes fast)
|
||||
// the 3 here is for the hardcoded "first"/"second"/"last" fills.
|
||||
for (int fill = 3; fill < expected_fills; ++fill) {
|
||||
set_subtest("fill #%d", fill);
|
||||
void *const buf = XML_GetBuffer(parser, fillsize);
|
||||
if (buf == NULL) {
|
||||
XML_ParserFree(parser); // avoid leaking our many-MiB parser
|
||||
#if defined(_WIN32) && ! defined(_WIN64)
|
||||
// workaround for win[e]32 on GitHub CI not being able to reach 1GiB
|
||||
return;
|
||||
#else
|
||||
fail("buffer is NULL");
|
||||
#endif
|
||||
const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1};
|
||||
const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1};
|
||||
const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1};
|
||||
|
||||
for (const int *leading = leading_list; *leading >= 0; leading++) {
|
||||
for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) {
|
||||
for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) {
|
||||
set_subtest("leading=%d bigtoken=%d fillsize=%d", *leading, *bigtoken,
|
||||
*fillsize);
|
||||
// start by checking that the test looks reasonably valid
|
||||
assert_true(*leading + *bigtoken <= document_length);
|
||||
|
||||
// put 'x' everywhere; some will be overwritten by elements.
|
||||
memset(document, 'x', document_length);
|
||||
// maybe add an initial tag
|
||||
if (*leading) {
|
||||
assert_true(*leading >= 3); // or the test case is invalid
|
||||
memcpy(document, "<a>", 3);
|
||||
}
|
||||
// add the large token
|
||||
document[*leading + 0] = '<';
|
||||
document[*leading + 1] = 'b';
|
||||
memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token
|
||||
document[*leading + *bigtoken - 1] = '>';
|
||||
|
||||
// 1 for 'b', plus 1 or 0 depending on the presence of 'a'
|
||||
const int expected_elem_total = 1 + (*leading ? 1 : 0);
|
||||
|
||||
XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL);
|
||||
assert_true(parser != NULL);
|
||||
|
||||
CharData storage;
|
||||
CharData_Init(&storage);
|
||||
XML_SetUserData(parser, &storage);
|
||||
XML_SetStartElementHandler(parser, start_element_event_handler);
|
||||
|
||||
g_biggestAlloc = 0;
|
||||
g_totalAlloc = 0;
|
||||
int offset = 0;
|
||||
// fill data until the big token is covered (but not necessarily parsed)
|
||||
while (offset < *leading + *bigtoken) {
|
||||
assert_true(offset + *fillsize <= document_length);
|
||||
const enum XML_Status status
|
||||
= XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
|
||||
if (status != XML_STATUS_OK) {
|
||||
xml_failure(parser);
|
||||
}
|
||||
offset += *fillsize;
|
||||
}
|
||||
// Now, check that we've had a buffer allocation that could fit the
|
||||
// context bytes and our big token. In order to detect a special case,
|
||||
// we need to know how many bytes of our big token were included in the
|
||||
// first push that contained _any_ bytes of the big token:
|
||||
const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize);
|
||||
if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) {
|
||||
// Special case: we aren't saving any context, and the whole big token
|
||||
// was covered by a single fill, so Expat may have parsed directly
|
||||
// from our input pointer, without allocating an internal buffer.
|
||||
} else if (*leading < XML_CONTEXT_BYTES) {
|
||||
assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken);
|
||||
} else {
|
||||
assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken);
|
||||
}
|
||||
// fill data until the big token is actually parsed
|
||||
while (storage.count < expected_elem_total) {
|
||||
const size_t alloc_before = g_totalAlloc;
|
||||
assert_true(offset + *fillsize <= document_length);
|
||||
const enum XML_Status status
|
||||
= XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
|
||||
if (status != XML_STATUS_OK) {
|
||||
xml_failure(parser);
|
||||
}
|
||||
offset += *fillsize;
|
||||
// since all the bytes of the big token are already in the buffer,
|
||||
// the bufsize ceiling should make us finish its parsing without any
|
||||
// further buffer allocations. We assume that there will be no other
|
||||
// large allocations in this test.
|
||||
assert_true(g_totalAlloc - alloc_before < 4096);
|
||||
}
|
||||
// test-the-test: was our alloc even called?
|
||||
assert_true(g_totalAlloc > 0);
|
||||
// test-the-test: there shouldn't be any extra start elements
|
||||
assert_true(storage.count == expected_elem_total);
|
||||
|
||||
XML_ParserFree(parser);
|
||||
}
|
||||
}
|
||||
// the heuristic should defer parsing, so the error is not reported yet
|
||||
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
|
||||
xml_failure(parser);
|
||||
}
|
||||
// last fill, should actually parse and detect the error
|
||||
{
|
||||
set_subtest("last fill");
|
||||
void *const buf = XML_GetBuffer(parser, fillsize);
|
||||
assert_true(buf != NULL);
|
||||
// Using isFinal=XML_FALSE here is important: we want to check that the
|
||||
// heuristic correctly detects the "close to out-of-memory" situation and
|
||||
// actually parses the pending data. XML_TRUE would force the heuristic to
|
||||
// parse regardless, which is not what we want.
|
||||
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_ERROR)
|
||||
fail("expected parse error");
|
||||
assert_true(XML_GetErrorCode(parser) == XML_ERROR_TAG_MISMATCH);
|
||||
}
|
||||
XML_ParserFree(parser);
|
||||
*/
|
||||
free(document);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
|
@ -5904,5 +5947,5 @@ make_basic_test_case(Suite *s) {
|
|||
tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
|
||||
tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
|
||||
tcase_add_test(tc_basic, test_set_bad_reparse_option);
|
||||
tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_maxbuf);
|
||||
tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue