Merge pull request #789 from SonyMobile/partial-token-perf

Speed up parsing of big tokens
2025-04-13 08:02:56 +00:00 · 2024-01-30 22:54:37 +01:00 · 2024-01-30 22:54:37 +01:00 · 34b598c5f5
commit 34b598c5f5
parent 2becc8a81d d5b02e96ab
22 changed files with 883 additions and 94 deletions
--- a/.github/workflows/data/exported-symbols.txt
+++ b/.github/workflows/data/exported-symbols.txt
@ -53,6 +53,7 @@ XML_SetNotationDeclHandler
 XML_SetNotStandaloneHandler
 XML_SetParamEntityParsing
 XML_SetProcessingInstructionHandler
+XML_SetReparseDeferralEnabled
 XML_SetReturnNSTriplet
 XML_SetSkippedEntityHandler
 XML_SetStartCdataSectionHandler
--- a/expat/Makefile.am
+++ b/expat/Makefile.am
@ -131,6 +131,11 @@ buildlib:
 run-benchmark:
 	$(MAKE) -C tests/benchmark
 	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/recset.xml 65535 3
+	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_attr.xml 4096 3
+	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_cdata.xml 4096 3
+	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_comment.xml 4096 3
+	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_tag.xml 4096 3
+	./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_text.xml 4096 3

 .PHONY: download-xmlts-zip
 download-xmlts-zip:
--- a/expat/doc/reference.html
+++ b/expat/doc/reference.html
@ -152,10 +152,11 @@ interface.</p>
    </ul>
    </li>
    <li>
-      <a href="#billion-laughs">Billion Laughs Attack Protection</a>
+      <a href="#attack-protection">Attack Protection</a>
      <ul>
        <li><a href="#XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</a></li>
        <li><a href="#XML_SetBillionLaughsAttackProtectionActivationThreshold">XML_SetBillionLaughsAttackProtectionActivationThreshold</a></li>
+        <li><a href="#XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</a></li>
      </ul>
    </li>
    <li><a href="#miscellaneous">Miscellaneous Functions</a>
@ -2167,11 +2168,7 @@ parse position may be before the beginning of the buffer.</p>
 return <code>NULL</code>.</p>
 </div>

-<h3><a name="billion-laughs">Billion Laughs Attack Protection</a></h3>
-
-<p>The functions in this section configure the built-in
-  protection against various forms of
-  <a href="https://en.wikipedia.org/wiki/Billion_laughs_attack">billion laughs attacks</a>.</p>
+<h3><a name="attack-protection">Attack Protection</a><a name="billion-laughs"></a></h3>

 <h4 id="XML_SetBillionLaughsAttackProtectionMaximumAmplification">XML_SetBillionLaughsAttackProtectionMaximumAmplification</h4>
 <pre class="fcndec">
@ -2259,6 +2256,27 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(XML_Parser p,
  </p>
 </div>

+<h4 id="XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</h4>
+<pre class="fcndec">
+/* Added in Expat 2.6.0. */
+XML_Bool XMLCALL
+XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
+</pre>
+<div class="fcndef">
+  <p>
+    Large tokens may require many parse calls before enough data is available for Expat to parse it in full.
+    If Expat retried parsing the token on every parse call, parsing could take quadratic time.
+    To avoid this, Expat only retries once a significant amount of new data is available.
+    This function allows disabling this behavior.
+  </p>
+  <p>
+    The <code>enabled</code> argument should be <code>XML_TRUE</code> or <code>XML_FALSE</code>.
+  </p>
+  <p>
+    Returns <code>XML_TRUE</code> on success, and <code>XML_FALSE</code> on error.
+  </p>
+</div>
+
 <h3><a name="miscellaneous">Miscellaneous functions</a></h3>

 <p>The functions in this section either obtain state information from
--- a/expat/doc/xmlwf.xml
+++ b/expat/doc/xmlwf.xml
@ -342,6 +342,16 @@ supports both.
        </listitem>
      </varlistentry>

+      <varlistentry>
+        <term><option>-q</option></term>
+        <listitem>
+          <para>
+            Disable reparse deferral, and allow quadratic parse runtime
+            on large tokens (default: reparse deferral enabled).
+          </para>
+        </listitem>
+      </varlistentry>
+
      <varlistentry>
        <term><option>-r</option></term>
        <listitem>
--- a/expat/lib/expat.h
+++ b/expat/lib/expat.h
@ -16,6 +16,7 @@
   Copyright (c) 2016      Thomas Beutlich <tc@tbeu.de>
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2022      Thijs Schreijer <thijs@thijsschreijer.nl>
+   Copyright (c) 2023      Sony Corporation / Snild Dolkow <snild@sony.com>
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
@ -1054,6 +1055,10 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
    XML_Parser parser, unsigned long long activationThresholdBytes);
 #endif

+/* Added in Expat 2.6.0. */
+XMLPARSEAPI(XML_Bool)
+XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
+
 /* Expat follows the semantic versioning convention.
   See https://semver.org
 */
--- a/expat/lib/internal.h
+++ b/expat/lib/internal.h
@ -31,6 +31,7 @@
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2018      Yury Gribov <tetra2005@gmail.com>
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
+   Copyright (c) 2023      Sony Corporation / Snild Dolkow <snild@sony.com>
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
@ -160,6 +161,9 @@ unsigned long long testingAccountingGetCountBytesIndirect(XML_Parser parser);
 const char *unsignedCharToPrintable(unsigned char c);
 #endif

+extern XML_Bool g_reparseDeferralEnabledDefault; // written ONLY in runtests.c
+extern unsigned int g_parseAttempts;             // used for testing only
+
 #ifdef __cplusplus
 }
 #endif
--- a/expat/lib/libexpat.def.cmake
+++ b/expat/lib/libexpat.def.cmake
@ -77,3 +77,5 @@ EXPORTS
 ; added with version 2.4.0
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionActivationThreshold @69
@_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionMaximumAmplification @70
+; added with version 2.6.0
+  XML_SetReparseDeferralEnabled @71
--- a/expat/lib/xmlparse.c
+++ b/expat/lib/xmlparse.c
@ -89,6 +89,7 @@
 #  endif
 #endif

+#include <stdbool.h>
 #include <stddef.h>
 #include <string.h> /* memset(), memcpy() */
 #include <assert.h>
@ -212,6 +213,8 @@ typedef char ICHAR;
 /* Do safe (NULL-aware) pointer arithmetic */
 #define EXPAT_SAFE_PTR_DIFF(p, q) (((p) && (q)) ? ((p) - (q)) : 0)

+#define EXPAT_MIN(a, b) (((a) < (b)) ? (a) : (b))
+
 #include "internal.h"
 #include "xmltok.h"
 #include "xmlrole.h"
@ -624,6 +627,9 @@ static unsigned long getDebugLevel(const char *variableName,
       ? 0                                                                     \
       : ((*((pool)->ptr)++ = c), 1))

+XML_Bool g_reparseDeferralEnabledDefault = XML_TRUE; // write ONLY in runtests.c
+unsigned int g_parseAttempts = 0;                    // used for testing only
+
 struct XML_ParserStruct {
  /* The first member must be m_userData so that the XML_GetUserData
     macro works. */
@ -647,6 +653,9 @@ struct XML_ParserStruct {

  XML_Index m_parseEndByteIndex;
  const char *m_parseEndPtr;
+  size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
+  XML_Bool m_reparseDeferralEnabled;
+  int m_lastBufferRequestSize;
  XML_Char *m_dataBuf;
  XML_Char *m_dataBufEnd;
  XML_StartElementHandler m_startElementHandler;
@ -978,6 +987,47 @@ get_hash_secret_salt(XML_Parser parser) {
  return parser->m_hash_secret_salt;
 }

+static enum XML_Error
+callProcessor(XML_Parser parser, const char *start, const char *end,
+              const char **endPtr) {
+  const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start);
+
+  if (parser->m_reparseDeferralEnabled
+      && ! parser->m_parsingStatus.finalBuffer) {
+    // Heuristic: don't try to parse a partial token again until the amount of
+    // available data has increased significantly.
+    const size_t had_before = parser->m_partialTokenBytesBefore;
+    // ...but *do* try anyway if we're close to causing a reallocation.
+    size_t available_buffer
+        = EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer);
+#if XML_CONTEXT_BYTES > 0
+    available_buffer -= EXPAT_MIN(available_buffer, XML_CONTEXT_BYTES);
+#endif
+    available_buffer
+        += EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd);
+    // m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok
+    const bool enough
+        = (have_now >= 2 * had_before)
+          || ((size_t)parser->m_lastBufferRequestSize > available_buffer);
+
+    if (! enough) {
+      *endPtr = start; // callers may expect this to be set
+      return XML_ERROR_NONE;
+    }
+  }
+  g_parseAttempts += 1;
+  const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr);
+  if (ret == XML_ERROR_NONE) {
+    // if we consumed nothing, remember what we had on this parse attempt.
+    if (*endPtr == start) {
+      parser->m_partialTokenBytesBefore = have_now;
+    } else {
+      parser->m_partialTokenBytesBefore = 0;
+    }
+  }
+  return ret;
+}
+
 static XML_Bool /* only valid for root parser */
 startParsing(XML_Parser parser) {
  /* hash functions must be initialized before setContext() is called */
@ -1159,6 +1209,9 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
  parser->m_bufferEnd = parser->m_buffer;
  parser->m_parseEndByteIndex = 0;
  parser->m_parseEndPtr = NULL;
+  parser->m_partialTokenBytesBefore = 0;
+  parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault;
+  parser->m_lastBufferRequestSize = 0;
  parser->m_declElementType = NULL;
  parser->m_declAttributeId = NULL;
  parser->m_declEntity = NULL;
@ -1328,6 +1381,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
     to worry which hash secrets each table has.
  */
  unsigned long oldhash_secret_salt;
+  XML_Bool oldReparseDeferralEnabled;

  /* Validate the oldParser parameter before we pull everything out of it */
  if (oldParser == NULL)
@ -1372,6 +1426,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
     to worry which hash secrets each table has.
  */
  oldhash_secret_salt = parser->m_hash_secret_salt;
+  oldReparseDeferralEnabled = parser->m_reparseDeferralEnabled;

 #ifdef XML_DTD
  if (! context)
@ -1424,6 +1479,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context,
  parser->m_defaultExpandInternalEntities = oldDefaultExpandInternalEntities;
  parser->m_ns_triplets = oldns_triplets;
  parser->m_hash_secret_salt = oldhash_secret_salt;
+  parser->m_reparseDeferralEnabled = oldReparseDeferralEnabled;
  parser->m_parentParser = oldParser;
 #ifdef XML_DTD
  parser->m_paramEntityParsing = oldParamEntityParsing;
@ -1878,55 +1934,8 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
    parser->m_parsingStatus.parsing = XML_PARSING;
  }

-  if (len == 0) {
-    parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
-    if (! isFinal)
-      return XML_STATUS_OK;
-    parser->m_positionPtr = parser->m_bufferPtr;
-    parser->m_parseEndPtr = parser->m_bufferEnd;
-
-    /* If data are left over from last buffer, and we now know that these
-       data are the final chunk of input, then we have to check them again
-       to detect errors based on that fact.
-    */
-    parser->m_errorCode
-        = parser->m_processor(parser, parser->m_bufferPtr,
-                              parser->m_parseEndPtr, &parser->m_bufferPtr);
-
-    if (parser->m_errorCode == XML_ERROR_NONE) {
-      switch (parser->m_parsingStatus.parsing) {
-      case XML_SUSPENDED:
-        /* It is hard to be certain, but it seems that this case
-         * cannot occur.  This code is cleaning up a previous parse
-         * with no new data (since len == 0).  Changing the parsing
-         * state requires getting to execute a handler function, and
-         * there doesn't seem to be an opportunity for that while in
-         * this circumstance.
-         *
-         * Given the uncertainty, we retain the code but exclude it
-         * from coverage tests.
-         *
-         * LCOV_EXCL_START
-         */
-        XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr,
-                          parser->m_bufferPtr, &parser->m_position);
-        parser->m_positionPtr = parser->m_bufferPtr;
-        return XML_STATUS_SUSPENDED;
-        /* LCOV_EXCL_STOP */
-      case XML_INITIALIZED:
-      case XML_PARSING:
-        parser->m_parsingStatus.parsing = XML_FINISHED;
-        /* fall through */
-      default:
-        return XML_STATUS_OK;
-      }
-    }
-    parser->m_eventEndPtr = parser->m_eventPtr;
-    parser->m_processor = errorProcessor;
-    return XML_STATUS_ERROR;
-  }
 #if XML_CONTEXT_BYTES == 0
-  else if (parser->m_bufferPtr == parser->m_bufferEnd) {
+  if (parser->m_bufferPtr == parser->m_bufferEnd) {
    const char *end;
    int nLeftOver;
    enum XML_Status result;
@ -1937,12 +1946,15 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
      parser->m_processor = errorProcessor;
      return XML_STATUS_ERROR;
    }
+    // though this isn't a buffer request, we assume that `len` is the app's
+    // preferred buffer fill size, and therefore save it here.
+    parser->m_lastBufferRequestSize = len;
    parser->m_parseEndByteIndex += len;
    parser->m_positionPtr = s;
    parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;

    parser->m_errorCode
-        = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end);
+        = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end);

    if (parser->m_errorCode != XML_ERROR_NONE) {
      parser->m_eventEndPtr = parser->m_eventPtr;
@ -1975,6 +1987,9 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
      parser->m_parsingStatus.parsing = XML_PARSING;
      void *const temp = XML_GetBuffer(parser, nLeftOver);
      parser->m_parsingStatus.parsing = originalStatus;
+      // GetBuffer may have overwritten this, but we want to remember what the
+      // app requested, not how many bytes were left over after parsing.
+      parser->m_lastBufferRequestSize = len;
      if (temp == NULL) {
        // NOTE: parser->m_errorCode has already been set by XML_GetBuffer().
        parser->m_eventPtr = parser->m_eventEndPtr = NULL;
@ -1996,15 +2011,14 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
    return result;
  }
 #endif /* XML_CONTEXT_BYTES == 0 */
-  else {
-    void *buff = XML_GetBuffer(parser, len);
-    if (buff == NULL)
-      return XML_STATUS_ERROR;
-    else {
-      memcpy(buff, s, len);
-      return XML_ParseBuffer(parser, len, isFinal);
-    }
+  void *buff = XML_GetBuffer(parser, len);
+  if (buff == NULL)
+    return XML_STATUS_ERROR;
+  if (len > 0) {
+    assert(s != NULL); // make sure s==NULL && len!=0 was rejected above
+    memcpy(buff, s, len);
  }
+  return XML_ParseBuffer(parser, len, isFinal);
 }

 enum XML_Status XMLCALL
@ -2044,8 +2058,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
  parser->m_parseEndByteIndex += len;
  parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;

-  parser->m_errorCode = parser->m_processor(
-      parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
+  parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr,
+                                      &parser->m_bufferPtr);

  if (parser->m_errorCode != XML_ERROR_NONE) {
    parser->m_eventEndPtr = parser->m_eventPtr;
@ -2090,7 +2104,11 @@ XML_GetBuffer(XML_Parser parser, int len) {
  default:;
  }

-  if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)) {
+  // whether or not the request succeeds, `len` seems to be the app's preferred
+  // buffer fill size; remember it.
+  parser->m_lastBufferRequestSize = len;
+  if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)
+      || parser->m_buffer == NULL) {
 #if XML_CONTEXT_BYTES > 0
    int keep;
 #endif /* XML_CONTEXT_BYTES > 0 */
@ -2113,8 +2131,9 @@ XML_GetBuffer(XML_Parser parser, int len) {
    }
    neededSize += keep;
 #endif /* XML_CONTEXT_BYTES > 0 */
-    if (neededSize
-        <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
+    if (parser->m_buffer && parser->m_bufferPtr
+        && neededSize
+               <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
 #if XML_CONTEXT_BYTES > 0
      if (keep < EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer)) {
        int offset
@ -2128,14 +2147,12 @@ XML_GetBuffer(XML_Parser parser, int len) {
        parser->m_bufferPtr -= offset;
      }
 #else
-      if (parser->m_buffer && parser->m_bufferPtr) {
-        memmove(parser->m_buffer, parser->m_bufferPtr,
-                EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
-        parser->m_bufferEnd
-            = parser->m_buffer
-              + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
-        parser->m_bufferPtr = parser->m_buffer;
-      }
+      memmove(parser->m_buffer, parser->m_bufferPtr,
+              EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
+      parser->m_bufferEnd
+          = parser->m_buffer
+            + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
+      parser->m_bufferPtr = parser->m_buffer;
 #endif /* XML_CONTEXT_BYTES > 0 */
    } else {
      char *newBuf;
@ -2237,7 +2254,7 @@ XML_ResumeParser(XML_Parser parser) {
  }
  parser->m_parsingStatus.parsing = XML_PARSING;

-  parser->m_errorCode = parser->m_processor(
+  parser->m_errorCode = callProcessor(
      parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);

  if (parser->m_errorCode != XML_ERROR_NONE) {
@ -2592,6 +2609,15 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
 }
 #endif /* XML_GE == 1 */

+XML_Bool XMLCALL
+XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) {
+  if (parser != NULL && (enabled == XML_TRUE || enabled == XML_FALSE)) {
+    parser->m_reparseDeferralEnabled = enabled;
+    return XML_TRUE;
+  }
+  return XML_FALSE;
+}
+
 /* Initially tag->rawName always points into the parse buffer;
   for those TAG instances opened while the current parse buffer was
   processed, and not yet closed, we need to store tag->rawName in a more
--- a/expat/tests/basic_tests.c
+++ b/expat/tests/basic_tests.c
@ -49,6 +49,7 @@

 #include <stdio.h>
 #include <string.h>
+#include <time.h>

 #if ! defined(__cplusplus)
 #  include <stdbool.h>
@ -2910,6 +2911,20 @@ START_TEST(test_buffer_can_grow_to_max) {
 }
 END_TEST

+START_TEST(test_getbuffer_allocates_on_zero_len) {
+  for (int first_len = 1; first_len >= 0; first_len--) {
+    set_subtest("with len=%d first", first_len);
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    assert_true(XML_GetBuffer(parser, first_len) != NULL);
+    assert_true(XML_GetBuffer(parser, 0) != NULL);
+    if (XML_ParseBuffer(parser, 0, XML_FALSE) != XML_STATUS_OK)
+      xml_failure(parser);
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
 /* Test position information macros */
 START_TEST(test_byte_info_at_end) {
  const char *text = "<doc></doc>";
@ -3148,7 +3163,7 @@ static int XMLCALL
 external_bom_checker(XML_Parser parser, const XML_Char *context,
                     const XML_Char *base, const XML_Char *systemId,
                     const XML_Char *publicId) {
-  const char *text = "";
+  const char *text;
  UNUSED_P(base);
  UNUSED_P(systemId);
  UNUSED_P(publicId);
@ -3625,7 +3640,9 @@ START_TEST(test_suspend_resume_internal_entity) {
  XML_SetStartElementHandler(g_parser, start_element_suspender);
  XML_SetCharacterDataHandler(g_parser, accumulate_characters);
  XML_SetUserData(g_parser, &storage);
-  if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
+  // can't use SINGLE_BYTES here, because it'll return early on suspension, and
+  // we won't know exactly how much input we actually managed to give Expat.
+  if (XML_Parse(g_parser, text, (int)strlen(text), XML_TRUE)
      != XML_STATUS_SUSPENDED)
    xml_failure(g_parser);
  CharData_CheckXMLChars(&storage, XCS(""));
@ -4638,6 +4655,12 @@ START_TEST(test_utf8_in_start_tags) {
  char doc[1024];
  size_t failCount = 0;

+  // we need all the bytes to be parsed, but we don't want the errors that can
+  // trigger on isFinal=XML_TRUE, so we skip the test if the heuristic is on.
+  if (g_reparseDeferralEnabledDefault) {
+    return;
+  }
+
  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
    size_t j = 0;
    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
@ -5178,6 +5201,629 @@ START_TEST(test_nested_entity_suspend) {
 }
 END_TEST

+/* Regression test for quadratic parsing on large tokens */
+START_TEST(test_big_tokens_take_linear_time) {
+  const char *const too_slow_failure_message
+      = "Compared to the baseline runtime of the first test, this test has a "
+        "slowdown of more than <max_slowdown>. "
+        "Please keep increasing the value by 1 until it reliably passes the "
+        "test on your hardware and open a bug sharing that number with us. "
+        "Thanks in advance!";
+  const struct {
+    const char *pre;
+    const char *post;
+  } text[] = {
+      {"<a>", "</a>"},                      // assumed good, used as baseline
+      {"<b><![CDATA[ value: ", " ]]></b>"}, // CDATA, performed OK before patch
+      {"<c attr='", "'></c>"},              // big attribute, used to be O(N²)
+      {"<d><!-- ", " --></d>"},             // long comment, used to be O(N²)
+      {"<e><", "/></e>"},                   // big elem name, used to be O(N²)
+  };
+  const int num_cases = sizeof(text) / sizeof(text[0]);
+  // For the test we need a <max_slowdown> value that is:
+  // (1) big enough that the test passes reliably (avoiding flaky tests), and
+  // (2) small enough that the test actually catches regressions.
+  const int max_slowdown = 15;
+  char aaaaaa[4096];
+  const int fillsize = (int)sizeof(aaaaaa);
+  const int fillcount = 100;
+
+  memset(aaaaaa, 'a', fillsize);
+
+  if (! g_reparseDeferralEnabledDefault) {
+    return; // heuristic is disabled; we would get O(n^2) and fail.
+  }
+#if defined(_WIN32)
+  if (CLOCKS_PER_SEC < 100000) {
+    // Skip this test if clock() doesn't have reasonably good resolution.
+    // This workaround is only applied to Windows targets, since XSI requires
+    // the value to be 1 000 000 (10x the condition here), and we want to be
+    // very sure that at least one platform in CI can catch regressions.
+    return;
+  }
+#endif
+
+  clock_t baseline = 0;
+  for (int i = 0; i < num_cases; ++i) {
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    enum XML_Status status;
+    set_subtest("max_slowdown=%d text=\"%saaaaaa%s\"", max_slowdown,
+                text[i].pre, text[i].post);
+    const clock_t start = clock();
+
+    // parse the start text
+    status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre,
+                                     (int)strlen(text[i].pre), XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+    // parse lots of 'a', failing the test early if it takes too long
+    for (int f = 0; f < fillcount; ++f) {
+      status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE);
+      if (status != XML_STATUS_OK) {
+        xml_failure(parser);
+      }
+      // i == 0 means we're still calculating the baseline value
+      if (i > 0) {
+        const clock_t now = clock();
+        const clock_t clocks_so_far = now - start;
+        const int slowdown = clocks_so_far / baseline;
+        if (slowdown >= max_slowdown) {
+          fprintf(
+              stderr,
+              "fill#%d: clocks_so_far=%d baseline=%d slowdown=%d max_slowdown=%d\n",
+              f, (int)clocks_so_far, (int)baseline, slowdown, max_slowdown);
+          fail(too_slow_failure_message);
+        }
+      }
+    }
+    // parse the end text
+    status = _XML_Parse_SINGLE_BYTES(parser, text[i].post,
+                                     (int)strlen(text[i].post), XML_TRUE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+
+    // how long did it take in total?
+    const clock_t end = clock();
+    const clock_t taken = end - start;
+    if (i == 0) {
+      assert_true(taken > 0); // just to make sure we don't div-by-0 later
+      baseline = taken;
+    }
+    const int slowdown = taken / baseline;
+    if (slowdown >= max_slowdown) {
+      fprintf(stderr, "taken=%d baseline=%d slowdown=%d max_slowdown=%d\n",
+              (int)taken, (int)baseline, slowdown, max_slowdown);
+      fail(too_slow_failure_message);
+    }
+
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
+START_TEST(test_set_reparse_deferral) {
+  const char *const pre = "<d>";
+  const char *const start = "<x attr='";
+  const char *const end = "'></x>";
+  char eeeeee[100];
+  const int fillsize = (int)sizeof(eeeeee);
+  memset(eeeeee, 'e', fillsize);
+
+  for (int enabled = 0; enabled <= 1; enabled += 1) {
+    set_subtest("deferral=%d", enabled);
+
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
+    // pre-grow the buffer to avoid reparsing due to almost-fullness
+    assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
+
+    CharData storage;
+    CharData_Init(&storage);
+    XML_SetUserData(parser, &storage);
+    XML_SetStartElementHandler(parser, start_element_event_handler);
+
+    enum XML_Status status;
+    // parse the start text
+    status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+    CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done
+
+    // ..and the start of the token
+    status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+    CharData_CheckXMLChars(&storage, XCS("d")); // still just the first one
+
+    // try to parse lots of 'e', but the token isn't finished
+    for (int c = 0; c < 100; ++c) {
+      status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
+      if (status != XML_STATUS_OK) {
+        xml_failure(parser);
+      }
+    }
+    CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one
+
+    // end the <x> token.
+    status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+
+    if (enabled) {
+      // In general, we may need to push more data to trigger a reparse attempt,
+      // but in this test, the data is constructed to always require it.
+      CharData_CheckXMLChars(&storage, XCS("d")); // or the test is incorrect
+      // 2x the token length should suffice; the +1 covers the start and end.
+      for (int c = 0; c < 101; ++c) {
+        status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
+        if (status != XML_STATUS_OK) {
+          xml_failure(parser);
+        }
+      }
+    }
+    CharData_CheckXMLChars(&storage, XCS("dx")); // the <x> should be done
+
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
+struct element_decl_data {
+  XML_Parser parser;
+  int count;
+};
+
+static void
+element_decl_counter(void *userData, const XML_Char *name, XML_Content *model) {
+  UNUSED_P(name);
+  struct element_decl_data *testdata = (struct element_decl_data *)userData;
+  testdata->count += 1;
+  XML_FreeContentModel(testdata->parser, model);
+}
+
+static int
+external_inherited_parser(XML_Parser p, const XML_Char *context,
+                          const XML_Char *base, const XML_Char *systemId,
+                          const XML_Char *publicId) {
+  UNUSED_P(base);
+  UNUSED_P(systemId);
+  UNUSED_P(publicId);
+  const char *const pre = "<!ELEMENT document ANY>\n";
+  const char *const start = "<!ELEMENT ";
+  const char *const end = " ANY>\n";
+  const char *const post = "<!ELEMENT xyz ANY>\n";
+  const int enabled = *(int *)XML_GetUserData(p);
+  char eeeeee[100];
+  char spaces[100];
+  const int fillsize = (int)sizeof(eeeeee);
+  assert_true(fillsize == (int)sizeof(spaces));
+  memset(eeeeee, 'e', fillsize);
+  memset(spaces, ' ', fillsize);
+
+  XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL);
+  assert_true(parser != NULL);
+  // pre-grow the buffer to avoid reparsing due to almost-fullness
+  assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
+
+  struct element_decl_data testdata;
+  testdata.parser = parser;
+  testdata.count = 0;
+  XML_SetUserData(parser, &testdata);
+  XML_SetElementDeclHandler(parser, element_decl_counter);
+
+  enum XML_Status status;
+  // parse the initial text
+  status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  assert_true(testdata.count == 1); // first element should be done
+
+  // ..and the start of the big token
+  status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  assert_true(testdata.count == 1); // still just the first one
+
+  // try to parse lots of 'e', but the token isn't finished
+  for (int c = 0; c < 100; ++c) {
+    status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+  }
+  assert_true(testdata.count == 1); // *still* just the first one
+
+  // end the big token.
+  status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+
+  if (enabled) {
+    // In general, we may need to push more data to trigger a reparse attempt,
+    // but in this test, the data is constructed to always require it.
+    assert_true(testdata.count == 1); // or the test is incorrect
+    // 2x the token length should suffice; the +1 covers the start and end.
+    for (int c = 0; c < 101; ++c) {
+      status = XML_Parse(parser, spaces, fillsize, XML_FALSE);
+      if (status != XML_STATUS_OK) {
+        xml_failure(parser);
+      }
+    }
+  }
+  assert_true(testdata.count == 2); // the big token should be done
+
+  // parse the final text
+  status = XML_Parse(parser, post, (int)strlen(post), XML_TRUE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  assert_true(testdata.count == 3); // after isFinal=XML_TRUE, all must be done
+
+  XML_ParserFree(parser);
+  return XML_STATUS_OK;
+}
+
+START_TEST(test_reparse_deferral_is_inherited) {
+  const char *const text
+      = "<!DOCTYPE document SYSTEM 'something.ext'><document/>";
+  for (int enabled = 0; enabled <= 1; ++enabled) {
+    set_subtest("deferral=%d", enabled);
+
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    XML_SetUserData(parser, (void *)&enabled);
+    XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
+    // this handler creates a sub-parser and checks that its deferral behavior
+    // is what we expected, based on the value of `enabled` (in userdata).
+    XML_SetExternalEntityRefHandler(parser, external_inherited_parser);
+    assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
+    if (XML_Parse(parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
+      xml_failure(parser);
+
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
+START_TEST(test_set_reparse_deferral_on_null_parser) {
+  assert_true(XML_SetReparseDeferralEnabled(NULL, 0) == XML_FALSE);
+  assert_true(XML_SetReparseDeferralEnabled(NULL, 1) == XML_FALSE);
+  assert_true(XML_SetReparseDeferralEnabled(NULL, 10) == XML_FALSE);
+  assert_true(XML_SetReparseDeferralEnabled(NULL, 100) == XML_FALSE);
+  assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MIN)
+              == XML_FALSE);
+  assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MAX)
+              == XML_FALSE);
+}
+END_TEST
+
+START_TEST(test_set_reparse_deferral_on_the_fly) {
+  const char *const pre = "<d><x attr='";
+  const char *const end = "'></x>";
+  char iiiiii[100];
+  const int fillsize = (int)sizeof(iiiiii);
+  memset(iiiiii, 'i', fillsize);
+
+  XML_Parser parser = XML_ParserCreate(NULL);
+  assert_true(parser != NULL);
+  assert_true(XML_SetReparseDeferralEnabled(parser, XML_TRUE));
+
+  CharData storage;
+  CharData_Init(&storage);
+  XML_SetUserData(parser, &storage);
+  XML_SetStartElementHandler(parser, start_element_event_handler);
+
+  enum XML_Status status;
+  // parse the start text
+  status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done
+
+  // try to parse some 'i', but the token isn't finished
+  status = XML_Parse(parser, iiiiii, fillsize, XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one
+
+  // end the <x> token.
+  status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  CharData_CheckXMLChars(&storage, XCS("d")); // not yet.
+
+  // now change the heuristic setting and add *no* data
+  assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE));
+  // we avoid isFinal=XML_TRUE, because that would force-bypass the heuristic.
+  status = XML_Parse(parser, "", 0, XML_FALSE);
+  if (status != XML_STATUS_OK) {
+    xml_failure(parser);
+  }
+  CharData_CheckXMLChars(&storage, XCS("dx"));
+
+  XML_ParserFree(parser);
+}
+END_TEST
+
+START_TEST(test_set_bad_reparse_option) {
+  XML_Parser parser = XML_ParserCreate(NULL);
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 2));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 3));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 99));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 127));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 128));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 129));
+  assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 255));
+  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0));
+  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1));
+  XML_ParserFree(parser);
+}
+END_TEST
+
+static size_t g_totalAlloc = 0;
+static size_t g_biggestAlloc = 0;
+
+static void *
+counting_realloc(void *ptr, size_t size) {
+  g_totalAlloc += size;
+  if (size > g_biggestAlloc) {
+    g_biggestAlloc = size;
+  }
+  return realloc(ptr, size);
+}
+
+static void *
+counting_malloc(size_t size) {
+  return counting_realloc(NULL, size);
+}
+
+START_TEST(test_bypass_heuristic_when_close_to_bufsize) {
+  if (g_chunkSize != 0) {
+    // this test does not use SINGLE_BYTES, because it depends on very precise
+    // buffer fills.
+    return;
+  }
+  if (! g_reparseDeferralEnabledDefault) {
+    return; // this test is irrelevant when the deferral heuristic is disabled.
+  }
+
+  const int document_length = 65536;
+  char *const document = (char *)malloc(document_length);
+
+  const XML_Memory_Handling_Suite memfuncs = {
+      counting_malloc,
+      counting_realloc,
+      free,
+  };
+
+  const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1};
+  const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1};
+  const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1};
+
+  for (const int *leading = leading_list; *leading >= 0; leading++) {
+    for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) {
+      for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) {
+        set_subtest("leading=%d bigtoken=%d fillsize=%d", *leading, *bigtoken,
+                    *fillsize);
+        // start by checking that the test looks reasonably valid
+        assert_true(*leading + *bigtoken <= document_length);
+
+        // put 'x' everywhere; some will be overwritten by elements.
+        memset(document, 'x', document_length);
+        // maybe add an initial tag
+        if (*leading) {
+          assert_true(*leading >= 3); // or the test case is invalid
+          memcpy(document, "<a>", 3);
+        }
+        // add the large token
+        document[*leading + 0] = '<';
+        document[*leading + 1] = 'b';
+        memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token
+        document[*leading + *bigtoken - 1] = '>';
+
+        // 1 for 'b', plus 1 or 0 depending on the presence of 'a'
+        const int expected_elem_total = 1 + (*leading ? 1 : 0);
+
+        XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL);
+        assert_true(parser != NULL);
+
+        CharData storage;
+        CharData_Init(&storage);
+        XML_SetUserData(parser, &storage);
+        XML_SetStartElementHandler(parser, start_element_event_handler);
+
+        g_biggestAlloc = 0;
+        g_totalAlloc = 0;
+        int offset = 0;
+        // fill data until the big token is covered (but not necessarily parsed)
+        while (offset < *leading + *bigtoken) {
+          assert_true(offset + *fillsize <= document_length);
+          const enum XML_Status status
+              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
+          if (status != XML_STATUS_OK) {
+            xml_failure(parser);
+          }
+          offset += *fillsize;
+        }
+        // Now, check that we've had a buffer allocation that could fit the
+        // context bytes and our big token. In order to detect a special case,
+        // we need to know how many bytes of our big token were included in the
+        // first push that contained _any_ bytes of the big token:
+        const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize);
+        if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) {
+          // Special case: we aren't saving any context, and the whole big token
+          // was covered by a single fill, so Expat may have parsed directly
+          // from our input pointer, without allocating an internal buffer.
+        } else if (*leading < XML_CONTEXT_BYTES) {
+          assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken);
+        } else {
+          assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken);
+        }
+        // fill data until the big token is actually parsed
+        while (storage.count < expected_elem_total) {
+          const size_t alloc_before = g_totalAlloc;
+          assert_true(offset + *fillsize <= document_length);
+          const enum XML_Status status
+              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
+          if (status != XML_STATUS_OK) {
+            xml_failure(parser);
+          }
+          offset += *fillsize;
+          // since all the bytes of the big token are already in the buffer,
+          // the bufsize ceiling should make us finish its parsing without any
+          // further buffer allocations. We assume that there will be no other
+          // large allocations in this test.
+          assert_true(g_totalAlloc - alloc_before < 4096);
+        }
+        // test-the-test: was our alloc even called?
+        assert_true(g_totalAlloc > 0);
+        // test-the-test: there shouldn't be any extra start elements
+        assert_true(storage.count == expected_elem_total);
+
+        XML_ParserFree(parser);
+      }
+    }
+  }
+  free(document);
+}
+END_TEST
+
+START_TEST(test_varying_buffer_fills) {
+  const int KiB = 1024;
+  const int MiB = 1024 * KiB;
+  const int document_length = 16 * MiB;
+  const int big = 7654321; // arbitrarily chosen between 4 and 8 MiB
+
+  if (g_chunkSize != 0) {
+    return; // this test is slow, and doesn't use _XML_Parse_SINGLE_BYTES().
+  }
+
+  char *const document = (char *)malloc(document_length);
+  assert_true(document != NULL);
+  memset(document, 'x', document_length);
+  document[0] = '<';
+  document[1] = 't';
+  memset(&document[2], ' ', big - 2); // a very spacy token
+  document[big - 1] = '>';
+
+  // Each testcase is a list of buffer fill sizes, terminated by a value < 0.
+  // When reparse deferral is enabled, the final (negated) value is the expected
+  // maximum number of bytes scanned in parse attempts.
+  const int testcases[][30] = {
+      {8 * MiB, -8 * MiB},
+      {4 * MiB, 4 * MiB, -12 * MiB}, // try at 4MB, then 8MB = 12 MB total
+      // zero-size fills shouldn't trigger the bypass
+      {4 * MiB, 0, 4 * MiB, -12 * MiB},
+      {4 * MiB, 0, 0, 4 * MiB, -12 * MiB},
+      {4 * MiB, 0, 1 * MiB, 0, 3 * MiB, -12 * MiB},
+      // try to hit the buffer ceiling only once (at the end)
+      {4 * MiB, 2 * MiB, 1 * MiB, 512 * KiB, 256 * KiB, 256 * KiB, -12 * MiB},
+      // try to hit the same buffer ceiling multiple times
+      {4 * MiB + 1, 2 * MiB, 1 * MiB, 512 * KiB, -25 * MiB},
+
+      // try to hit every ceiling, by always landing 1K shy of the buffer size
+      {1 * KiB, 2 * KiB, 4 * KiB, 8 * KiB, 16 * KiB, 32 * KiB, 64 * KiB,
+       128 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, -16 * MiB},
+
+      // try to avoid every ceiling, by always landing 1B past the buffer size
+      // the normal 2x heuristic threshold still forces parse attempts.
+      {2 * KiB + 1,          // will attempt 2KiB + 1 ==> total 2KiB + 1
+       2 * KiB, 4 * KiB,     // will attempt 8KiB + 1 ==> total 10KiB + 2
+       8 * KiB, 16 * KiB,    // will attempt 32KiB + 1 ==> total 42KiB + 3
+       32 * KiB, 64 * KiB,   // will attempt 128KiB + 1 ==> total 170KiB + 4
+       128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
+       512 * KiB, 1 * MiB,   // will attempt 2MiB + 1 ==> total 2M + 682K + 6
+       2 * MiB, 4 * MiB,     // will attempt 8MiB + 1 ==> total 10M + 682K + 7
+       -(10 * MiB + 682 * KiB + 7)},
+      // try to avoid every ceiling again, except on our last fill.
+      {2 * KiB + 1,          // will attempt 2KiB + 1 ==> total 2KiB + 1
+       2 * KiB, 4 * KiB,     // will attempt 8KiB + 1 ==> total 10KiB + 2
+       8 * KiB, 16 * KiB,    // will attempt 32KiB + 1 ==> total 42KiB + 3
+       32 * KiB, 64 * KiB,   // will attempt 128KiB + 1 ==> total 170KiB + 4
+       128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
+       512 * KiB, 1 * MiB,   // will attempt 2MiB + 1 ==> total 2M + 682K + 6
+       2 * MiB, 4 * MiB - 1, // will attempt 8MiB ==> total 10M + 682K + 6
+       -(10 * MiB + 682 * KiB + 6)},
+
+      // try to hit ceilings on the way multiple times
+      {512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 1 MiB buffer
+       512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 2 MiB buffer
+       1 * MiB + 1, 512 * KiB, 256 * KiB, 256 * KiB - 1,   // 4 MiB buffer
+       2 * MiB + 1, 1 * MiB, 512 * KiB,                    // 8 MiB buffer
+       // we'll make a parse attempt at every parse call
+       -(45 * MiB + 12)},
+  };
+  const int testcount = sizeof(testcases) / sizeof(testcases[0]);
+  for (int test_i = 0; test_i < testcount; test_i++) {
+    const int *fillsize = testcases[test_i];
+    set_subtest("#%d {%d %d %d %d ...}", test_i, fillsize[0], fillsize[1],
+                fillsize[2], fillsize[3]);
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    g_parseAttempts = 0;
+
+    CharData storage;
+    CharData_Init(&storage);
+    XML_SetUserData(parser, &storage);
+    XML_SetStartElementHandler(parser, start_element_event_handler);
+
+    int worstcase_bytes = 0; // sum of (buffered bytes at each XML_Parse call)
+    int scanned_bytes = 0;   // sum of (buffered bytes at each actual parse)
+    int offset = 0;
+    while (*fillsize >= 0) {
+      assert_true(offset + *fillsize <= document_length); // or test is invalid
+      const unsigned attempts_before = g_parseAttempts;
+      const enum XML_Status status
+          = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
+      if (status != XML_STATUS_OK) {
+        xml_failure(parser);
+      }
+      offset += *fillsize;
+      fillsize++;
+      assert_true(offset <= INT_MAX - worstcase_bytes); // avoid overflow
+      worstcase_bytes += offset; // we might've tried to parse all pending bytes
+      if (g_parseAttempts != attempts_before) {
+        assert_true(g_parseAttempts == attempts_before + 1); // max 1/XML_Parse
+        assert_true(offset <= INT_MAX - scanned_bytes);      // avoid overflow
+        scanned_bytes += offset; // we *did* try to parse all pending bytes
+      }
+    }
+    assert_true(storage.count == 1); // the big token should've been parsed
+    assert_true(scanned_bytes > 0);  // test-the-test: does our counter work?
+    if (g_reparseDeferralEnabledDefault) {
+      // heuristic is enabled; some XML_Parse calls may have deferred reparsing
+      const int max_bytes_scanned = -*fillsize;
+      if (scanned_bytes > max_bytes_scanned) {
+        fprintf(stderr,
+                "bytes scanned in parse attempts: actual=%d limit=%d \n",
+                scanned_bytes, max_bytes_scanned);
+        fail("too many bytes scanned in parse attempts");
+      }
+      assert_true(scanned_bytes <= worstcase_bytes);
+    } else {
+      // heuristic is disabled; every XML_Parse() will have reparsed
+      assert_true(scanned_bytes == worstcase_bytes);
+    }
+
+    XML_ParserFree(parser);
+  }
+  free(document);
+}
+END_TEST
+
 void
 make_basic_test_case(Suite *s) {
  TCase *tc_basic = tcase_create("basic tests");
@ -5299,6 +5945,7 @@ make_basic_test_case(Suite *s) {
  tcase_add_test(tc_basic, test_get_buffer_3_overflow);
 #endif
  tcase_add_test(tc_basic, test_buffer_can_grow_to_max);
+  tcase_add_test(tc_basic, test_getbuffer_allocates_on_zero_len);
  tcase_add_test(tc_basic, test_byte_info_at_end);
  tcase_add_test(tc_basic, test_byte_info_at_error);
  tcase_add_test(tc_basic, test_byte_info_at_cdata);
@ -5417,4 +6064,12 @@ make_basic_test_case(Suite *s) {
  tcase_add_test__ifdef_xml_dtd(tc_basic,
                                test_pool_integrity_with_unfinished_attr);
  tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend);
+  tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
+  tcase_add_test(tc_basic, test_set_reparse_deferral);
+  tcase_add_test(tc_basic, test_reparse_deferral_is_inherited);
+  tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
+  tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
+  tcase_add_test(tc_basic, test_set_bad_reparse_option);
+  tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize);
+  tcase_add_test(tc_basic, test_varying_buffer_fills);
 }
--- a/expat/tests/common.c
+++ b/expat/tests/common.c
@ -185,7 +185,7 @@ _xml_failure(XML_Parser parser, const char *file, int line) {
           "u, offset %" XML_FMT_INT_MOD "u)\n    reported from %s, line %d\n",
           err, XML_ErrorString(err), XML_GetCurrentLineNumber(parser),
           XML_GetCurrentColumnNumber(parser), file, line);
-  _assert_true(0, file, line, buffer);
+  _fail(file, line, buffer);
 }

 enum XML_Status
@ -214,9 +214,9 @@ _expect_failure(const char *text, enum XML_Error errorCode,
                const char *errorMessage, const char *file, int lineno) {
  if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
      == XML_STATUS_OK)
-    /* Hackish use of _assert_true() macro, but let's us report
+    /* Hackish use of _fail() macro, but lets us report
       the right filename and line number. */
-    _assert_true(0, file, lineno, errorMessage);
+    _fail(file, lineno, errorMessage);
  if (XML_GetErrorCode(g_parser) != errorCode)
    _xml_failure(g_parser, file, lineno);
 }
--- a/expat/tests/handlers.c
+++ b/expat/tests/handlers.c
@ -1717,7 +1717,9 @@ record_element_end_handler(void *userData, const XML_Char *name) {
 const struct handler_record_entry *
 _handler_record_get(const struct handler_record_list *storage, int index,
                    const char *file, int line) {
-  _assert_true(storage->count > index, file, line, "too few handler calls");
+  if (storage->count <= index) {
+    _fail(file, line, "too few handler calls");
+  }
  return &storage->entries[index];
 }

--- a/expat/tests/minicheck.c
+++ b/expat/tests/minicheck.c
@ -244,14 +244,11 @@ srunner_summarize(SRunner *runner, int verbosity) {
 }

 void
-_assert_true(int condition, const char *file, int line, const char *msg) {
+_fail(const char *file, int line, const char *msg) {
  /* Always print the error message so it isn't lost.  In this case,
     we have a failure, so there's no reason to be quiet about what
     it is.
  */
-  if (condition) {
-    return;
-  }
  _check_current_filename = file;
  _check_current_lineno = line;
  if (msg != NULL) {
--- a/expat/tests/minicheck.h
+++ b/expat/tests/minicheck.h
@ -83,9 +83,13 @@ extern "C" {

 void PRINTF_LIKE(1, 2) set_subtest(char const *fmt, ...);

-#  define fail(msg) _assert_true(0, __FILE__, __LINE__, msg)
+#  define fail(msg) _fail(__FILE__, __LINE__, msg)
 #  define assert_true(cond)                                                    \
-    _assert_true((cond), __FILE__, __LINE__, "check failed: " #cond)
+    do {                                                                       \
+      if (! (cond)) {                                                          \
+        _fail(__FILE__, __LINE__, "check failed: " #cond);                     \
+      }                                                                        \
+    } while (0)

 typedef void (*tcase_setup_function)(void);
 typedef void (*tcase_teardown_function)(void);
@ -124,7 +128,11 @@ void _check_set_test_info(char const *function, char const *filename,
 * Prototypes for the actual implementation.
 */

-void _assert_true(int condition, const char *file, int line, const char *msg);
+#  if defined(__GNUC__)
+__attribute__((noreturn))
+#  endif
+void
+_fail(const char *file, int line, const char *msg);
 Suite *suite_create(const char *name);
 TCase *tcase_create(const char *name);
 void suite_add_tcase(Suite *suite, TCase *tc);
--- a/expat/tests/runtests.c
+++ b/expat/tests/runtests.c
@ -98,10 +98,14 @@ main(int argc, char *argv[]) {
    printf("Expat version: %" XML_FMT_STR "\n", XML_ExpatVersion());

  for (g_chunkSize = 0; g_chunkSize <= 5; g_chunkSize++) {
-    char context[100];
-    snprintf(context, sizeof(context), "chunksize=%d", g_chunkSize);
-    context[sizeof(context) - 1] = '\0';
-    srunner_run_all(sr, context, verbosity);
+    for (int enabled = 0; enabled <= 1; ++enabled) {
+      char context[100];
+      g_reparseDeferralEnabledDefault = enabled;
+      snprintf(context, sizeof(context), "chunksize=%d deferral=%d",
+               g_chunkSize, enabled);
+      context[sizeof(context) - 1] = '\0';
+      srunner_run_all(sr, context, verbosity);
+    }
  }
  srunner_summarize(sr, verbosity);
  nf = srunner_ntests_failed(sr);
--- a/expat/xmlwf/xmlwf.c
+++ b/expat/xmlwf/xmlwf.c
@ -918,6 +918,9 @@ usage(const XML_Char *prog, int rc) {
      T("  -a FACTOR      set maximum tolerated [a]mplification factor (default: 100.0)\n")
      T("  -b BYTES       set number of output [b]ytes needed to activate (default: 8 MiB)\n")
      T("\n")
+      T("reparse deferral:\n")
+      T("  -q             disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n")
+      T("\n")
      T("info arguments:\n")
      T("  -h, --help     show this [h]elp message and exit\n")
      T("  -v, --version  show program's [v]ersion number and exit\n")
@ -973,6 +976,8 @@ tmain(int argc, XML_Char **argv) {
  unsigned long long attackThresholdBytes = 0;
  XML_Bool attackThresholdGiven = XML_FALSE;

+  XML_Bool disableDeferral = XML_FALSE;
+
  int exitCode = XMLWF_EXIT_SUCCESS;
  enum XML_ParamEntityParsing paramEntityParsing
      = XML_PARAM_ENTITY_PARSING_NEVER;
@ -1125,6 +1130,11 @@ tmain(int argc, XML_Char **argv) {
 #endif
      break;
    }
+    case T('q'): {
+      disableDeferral = XML_TRUE;
+      j++;
+      break;
+    }
    case T('\0'):
      if (j > 1) {
        i++;
@ -1171,6 +1181,16 @@ tmain(int argc, XML_Char **argv) {
 #endif
    }

+    if (disableDeferral) {
+      const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE);
+      if (! success) {
+        // This prevents tperror(..) from reporting misleading "[..]: Success"
+        errno = EINVAL;
+        tperror(T("Failed to disable reparse deferral"));
+        exit(XMLWF_EXIT_INTERNAL_ERROR);
+      }
+    }
+
    if (requireStandalone)
      XML_SetNotStandaloneHandler(parser, notStandalone);
    XML_SetParamEntityParsing(parser, paramEntityParsing);
--- a/expat/xmlwf/xmlwf_helpgen.py
+++ b/expat/xmlwf/xmlwf_helpgen.py
@ -82,6 +82,10 @@ billion_laughs.add_argument('-a', metavar='FACTOR',
                            help='set maximum tolerated [a]mplification factor (default: 100.0)')
 billion_laughs.add_argument('-b', metavar='BYTES', help='set number of output [b]ytes needed to activate (default: 8 MiB)')

+reparse_deferral = parser.add_argument_group('reparse deferral')
+reparse_deferral.add_argument('-q', metavar='FACTOR',
+                            help='disable reparse deferral, and allow [q]uadratic parse runtime with large tokens')
+
 parser.add_argument('files', metavar='FILE', nargs='*', help='file to process (default: STDIN)')

 info = parser.add_argument_group('info arguments')
--- a/testdata/largefiles/README.txt
+++ b/testdata/largefiles/README.txt
@ -31,4 +31,27 @@ resulting measurements tell us.)
    utility, specifically for testing the duplicate attribute check in
    storeAttributes()

+* aaaaaa_attr.xml (~10 MB):
+  - properties: trivial file with a huge attribute value
+  - source: generated by a simple shell script
+  - purpose: performance/regression test

+* aaaaaa_cdata.xml (~10 MB):
+  - properties: trivial file with huge cdata content
+  - source: generated by a simple shell script
+  - purpose: performance/regression test
+
+* aaaaaa_comment.xml (~10 MB):
+  - properties: trivial file with a huge comment
+  - source: generated by a simple shell script
+  - purpose: performance/regression test
+
+* aaaaaa_tag.xml (~10 MB):
+  - properties: trivial file with a huge tag name
+  - source: generated by a simple shell script
+  - purpose: performance/regression test
+
+* aaaaaa_text.xml (~10 MB):
+  - properties: trivial file with a huge text segment (no newlines)
+  - source: generated by a simple shell script
+  - purpose: performance/regression test
--- a/testdata/largefiles/aaaaaa_attr.xml
+++ b/testdata/largefiles/aaaaaa_attr.xml
--- a/testdata/largefiles/aaaaaa_cdata.xml
+++ b/testdata/largefiles/aaaaaa_cdata.xml
--- a/testdata/largefiles/aaaaaa_comment.xml
+++ b/testdata/largefiles/aaaaaa_comment.xml
--- a/testdata/largefiles/aaaaaa_tag.xml
+++ b/testdata/largefiles/aaaaaa_tag.xml
--- a/testdata/largefiles/aaaaaa_text.xml
+++ b/testdata/largefiles/aaaaaa_text.xml