Bypass partial token heuristic when close to maximum buffer size

For huge tokens, we may end up in a situation where the partial token
parse deferral heuristic demands more bytes than Expat's maximum buffer
size (currently ~half of INT_MAX) can hold.

INT_MAX/2 is 1024 MiB on most systems. Clearly, a token of 950 MiB could
fit in that buffer, but the reparse threshold might be such that
callProcessor() will defer it, allowing the app to keep filling the
buffer until XML_GetBuffer() eventually returns a memory error.

By bypassing the heuristic when we're getting close to the maximum
buffer size, it will once again be possible to parse tokens in the size
range INT_MAX/2/ratio < size < INT_MAX/2 reliably.

We subtract the last buffer fill size as a way to detect that the next
XML_GetBuffer() call has a risk of returning a memory error -- assuming
that the application is likely to keep using the same (or smaller) fill.

We subtract XML_CONTEXT_BYTES because that's the maximum number of bytes
that could remain at the start of the buffer, preceding the partial
token. Technically, it could be fewer bytes, but XML_CONTEXT_BYTES is
normally small relative to INT_MAX, and using it is much simpler.

Co-authored-by: Sebastian Pipping <sebastian@pipping.org>
This commit is contained in:
Snild Dolkow 2023-10-04 16:00:14 +02:00
parent ad9c01be8e
commit 60b7420989
2 changed files with 115 additions and 1 deletions

View file

@ -213,6 +213,8 @@ typedef char ICHAR;
/* NULL-aware pointer subtraction: evaluates to 0 when either operand is NULL,
   otherwise to (p) - (q). Operands may be evaluated more than once. */
#define EXPAT_SAFE_PTR_DIFF(p, q) ((! (p) || ! (q)) ? 0 : ((p) - (q)))
/* Smaller of two values; yields (b) when the operands compare equal.
   Operands may be evaluated more than once, so avoid side effects. */
#define EXPAT_MIN(a, b) (((b) > (a)) ? (a) : (b))
#include "internal.h"
#include "xmltok.h"
#include "xmlrole.h"
@ -652,6 +654,7 @@ struct XML_ParserStruct {
const char *m_parseEndPtr;
size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
XML_Bool m_reparseDeferralEnabled;
int m_lastBufferRequestSize;
XML_Char *m_dataBuf;
XML_Char *m_dataBufEnd;
XML_StartElementHandler m_startElementHandler;
@ -993,7 +996,18 @@ callProcessor(XML_Parser parser, const char *start, const char *end,
// Heuristic: don't try to parse a partial token again until the amount of
// available data has increased significantly.
const size_t had_before = parser->m_partialTokenBytesBefore;
const bool enough = (have_now >= 2 * had_before);
// ...but *do* try anyway if we're close to reaching the max buffer size.
size_t close_to_maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up
#if XML_CONTEXT_BYTES > 0
// subtract XML_CONTEXT_BYTES, but don't go below zero
close_to_maxbuf -= EXPAT_MIN(close_to_maxbuf, XML_CONTEXT_BYTES);
#endif
// subtract the last buffer fill size, but don't go below zero
// m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok
close_to_maxbuf
-= EXPAT_MIN(close_to_maxbuf, (size_t)parser->m_lastBufferRequestSize);
const bool enough
= (have_now >= 2 * had_before) || (have_now > close_to_maxbuf);
if (! enough) {
*endPtr = start; // callers may expect this to be set
@ -1195,6 +1209,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
parser->m_parseEndPtr = NULL;
parser->m_partialTokenBytesBefore = 0;
parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault;
parser->m_lastBufferRequestSize = 0;
parser->m_declElementType = NULL;
parser->m_declAttributeId = NULL;
parser->m_declEntity = NULL;
@ -1929,6 +1944,9 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
parser->m_processor = errorProcessor;
return XML_STATUS_ERROR;
}
// though this isn't a buffer request, we assume that `len` is the app's
// preferred buffer fill size, and therefore save it here.
parser->m_lastBufferRequestSize = len;
parser->m_parseEndByteIndex += len;
parser->m_positionPtr = s;
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
@ -1967,6 +1985,9 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
parser->m_parsingStatus.parsing = XML_PARSING;
void *const temp = XML_GetBuffer(parser, nLeftOver);
parser->m_parsingStatus.parsing = originalStatus;
// GetBuffer may have overwritten this, but we want to remember what the
// app requested, not how many bytes were left over after parsing.
parser->m_lastBufferRequestSize = len;
if (temp == NULL) {
// NOTE: parser->m_errorCode has already been set by XML_GetBuffer().
parser->m_eventPtr = parser->m_eventEndPtr = NULL;
@ -2081,6 +2102,9 @@ XML_GetBuffer(XML_Parser parser, int len) {
default:;
}
// whether or not the request succeeds, `len` seems to be the app's preferred
// buffer fill size; remember it.
parser->m_lastBufferRequestSize = len;
if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)
|| parser->m_buffer == NULL) {
#if XML_CONTEXT_BYTES > 0

View file

@ -5569,6 +5569,95 @@ START_TEST(test_set_bad_reparse_option) {
}
END_TEST
/* Placeholder for the "bypass the reparse deferral heuristic when close to
   the maximum buffer size" test. Its entire body is commented out, so the
   test currently performs no checks; the disabled code is kept below for
   reference only. */
START_TEST(test_bypass_heuristic_when_close_to_maxbuf) {
/* There used to be a test here, but one of its dependencies was removed in
a rebase. Since it will be replaced by test_..._when_close_to_bufsize() in
the next commit, it was not worth fixing.
Disabled test body (not compiled):
// this test is slow; avoid running it multiple times for no reason.
if (g_chunkSize != 0) {
return; // we don't use SINGLE_BYTES
}
if (! g_reparseDeferralEnabledDefault) {
return; // this test is irrelevant when the deferral heuristic is disabled.
}
const int maxbuf = INT_MAX / 2 + (INT_MAX & 1); // round up without overflow
const int fillsize = 1024 * 1024;
// we expect to be able to fill this many times, and no more.
// For example, in the common case of INT_MAX == 2³¹-1:
// * maxbuf will be exactly 1 GiB (1024 * 1024 * 1024 bytes)
// * that means Expat should be able to handle 1024 fills
// * ...but XML_CONTEXT_BYTES can steal some of it from us.
const int expected_fills = (maxbuf - XML_CONTEXT_BYTES) / fillsize;
// Just to make sure the test isn't completely broken, check that
// expected_fills is reasonable for a common setup where int is at
// least 32 bits, and XML_CONTEXT_BYTES is no more than 2 MiB.
if (sizeof(int) >= 4 && XML_CONTEXT_BYTES <= 2 * fillsize) {
assert_true(expected_fills >= 1022);
}
XML_Parser parser = XML_ParserCreate(NULL);
assert_true(parser != NULL);
// make the deferral heuristic's threshold grow *extremely* quickly.
assert_true(XML_SetReparseDeferralRatio(parser, (float)INT_MAX));
// first fill, will push the heuristic threshold beyond the max buffer size
{
set_subtest("first fill");
char *const buf = (char *)XML_GetBuffer(parser, fillsize);
assert_true(buf != NULL);
memset(buf, 'a', fillsize);
buf[0] = '<';
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
xml_failure(parser);
}
// second fill, with data that is not well-formed
{
set_subtest("second fill");
char *const buf = (char *)XML_GetBuffer(parser, fillsize);
assert_true(buf != NULL);
strcpy(buf, "></wrongend>"); // leaving the rest of the bytes uninitialized
// the heuristic should defer parsing, so the error is not reported yet
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
xml_failure(parser);
}
// lots more fills, with uninitialized data (so the test goes fast)
// the 3 here is for the hardcoded "first"/"second"/"last" fills.
for (int fill = 3; fill < expected_fills; ++fill) {
set_subtest("fill #%d", fill);
void *const buf = XML_GetBuffer(parser, fillsize);
if (buf == NULL) {
XML_ParserFree(parser); // avoid leaking our many-MiB parser
#if defined(_WIN32) && ! defined(_WIN64)
// workaround for win[e]32 on GitHub CI not being able to reach 1GiB
return;
#else
fail("buffer is NULL");
#endif
}
// the heuristic should defer parsing, so the error is not reported yet
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_OK)
xml_failure(parser);
}
// last fill, should actually parse and detect the error
{
set_subtest("last fill");
void *const buf = XML_GetBuffer(parser, fillsize);
assert_true(buf != NULL);
// Using isFinal=XML_FALSE here is important: we want to check that the
// heuristic correctly detects the "close to out-of-memory" situation and
// actually parses the pending data. XML_TRUE would force the heuristic to
// parse regardless, which is not what we want.
if (XML_ParseBuffer(parser, fillsize, XML_FALSE) != XML_STATUS_ERROR)
fail("expected parse error");
assert_true(XML_GetErrorCode(parser) == XML_ERROR_TAG_MISMATCH);
}
XML_ParserFree(parser);
*/
}
END_TEST
void
make_basic_test_case(Suite *s) {
TCase *tc_basic = tcase_create("basic tests");
@ -5815,4 +5904,5 @@ make_basic_test_case(Suite *s) {
tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
tcase_add_test(tc_basic, test_set_bad_reparse_option);
tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_maxbuf);
}