diff --git a/.github/workflows/data/exported-symbols.txt b/.github/workflows/data/exported-symbols.txt index 8fa1cef2..20c2a78b 100644 --- a/.github/workflows/data/exported-symbols.txt +++ b/.github/workflows/data/exported-symbols.txt @@ -53,6 +53,7 @@ XML_SetNotationDeclHandler XML_SetNotStandaloneHandler XML_SetParamEntityParsing XML_SetProcessingInstructionHandler +XML_SetReparseDeferralEnabled XML_SetReturnNSTriplet XML_SetSkippedEntityHandler XML_SetStartCdataSectionHandler diff --git a/expat/doc/reference.html b/expat/doc/reference.html index 2c943a58..899bb534 100644 --- a/expat/doc/reference.html +++ b/expat/doc/reference.html @@ -152,10 +152,11 @@ interface.

  • - Billion Laughs Attack Protection + Attack Protection
  • Miscellaneous Functions @@ -2167,11 +2168,7 @@ parse position may be before the beginning of the buffer.

    return NULL.

    -

    Billion Laughs Attack Protection

    - -

    The functions in this section configure the built-in - protection against various forms of - billion laughs attacks.

    +

    Attack Protection

    XML_SetBillionLaughsAttackProtectionMaximumAmplification

    @@ -2259,6 +2256,27 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(XML_Parser p,
       

    +

    XML_SetReparseDeferralEnabled

    +
    +/* Added in Expat 2.6.0. */
    +XML_Bool XMLCALL
    +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
    +
    +
    +

    + A large token may require many parse calls before enough data is available for Expat to parse it in full. + If Expat retried parsing the token on every parse call, parsing could take quadratic time. + To avoid this, Expat only retries once a significant amount of new data is available. + This function allows disabling (or re-enabling) this behavior for the given parser. +

    +

    + The enabled argument should be XML_TRUE or XML_FALSE. +

    +

    + Returns XML_TRUE on success, and XML_FALSE on error (that is, if parser is NULL or if enabled is neither XML_TRUE nor XML_FALSE). +

    +
    +

    Miscellaneous functions

    The functions in this section either obtain state information from diff --git a/expat/lib/expat.h b/expat/lib/expat.h index 4ea92439..aed625f0 100644 --- a/expat/lib/expat.h +++ b/expat/lib/expat.h @@ -16,6 +16,7 @@ Copyright (c) 2016 Thomas Beutlich Copyright (c) 2017 Rhodri James Copyright (c) 2022 Thijs Schreijer + Copyright (c) 2023 Sony Corporation / Snild Dolkow Licensed under the MIT license: Permission is hereby granted, free of charge, to any person obtaining @@ -1054,6 +1055,10 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold( XML_Parser parser, unsigned long long activationThresholdBytes); #endif +/* Added in Expat 2.6.0. */ +XMLPARSEAPI(XML_Bool) +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled); + /* Expat follows the semantic versioning convention. See https://semver.org */ diff --git a/expat/lib/libexpat.def.cmake b/expat/lib/libexpat.def.cmake index 61a4f006..10ee9cd6 100644 --- a/expat/lib/libexpat.def.cmake +++ b/expat/lib/libexpat.def.cmake @@ -77,3 +77,5 @@ EXPORTS ; added with version 2.4.0 @_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionActivationThreshold @69 @_EXPAT_COMMENT_DTD_OR_GE@ XML_SetBillionLaughsAttackProtectionMaximumAmplification @70 +; added with version 2.6.0 + XML_SetReparseDeferralEnabled @71 diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c index 749a85e8..e13ff6fb 100644 --- a/expat/lib/xmlparse.c +++ b/expat/lib/xmlparse.c @@ -651,6 +651,7 @@ struct XML_ParserStruct { XML_Index m_parseEndByteIndex; const char *m_parseEndPtr; size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */ + XML_Bool m_reparseDeferralEnabled; XML_Char *m_dataBuf; XML_Char *m_dataBufEnd; XML_StartElementHandler m_startElementHandler; @@ -987,7 +988,7 @@ callProcessor(XML_Parser parser, const char *start, const char *end, const char **endPtr) { const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start); - if (g_reparseDeferralEnabledDefault + if (parser->m_reparseDeferralEnabled && ! 
parser->m_parsingStatus.finalBuffer) { // Heuristic: don't try to parse a partial token again until the amount of // available data has increased significantly. @@ -1193,6 +1194,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) { parser->m_parseEndByteIndex = 0; parser->m_parseEndPtr = NULL; parser->m_partialTokenBytesBefore = 0; + parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault; parser->m_declElementType = NULL; parser->m_declAttributeId = NULL; parser->m_declEntity = NULL; @@ -2617,6 +2619,15 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold( } #endif /* XML_GE == 1 */ +XML_Bool XMLCALL +XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) { + if (parser != NULL && (enabled == XML_TRUE || enabled == XML_FALSE)) { + parser->m_reparseDeferralEnabled = enabled; + return XML_TRUE; + } + return XML_FALSE; +} + /* Initially tag->rawName always points into the parse buffer; for those TAG instances opened while the current parse buffer was processed, and not yet closed, we need to store tag->rawName in a more diff --git a/expat/tests/basic_tests.c b/expat/tests/basic_tests.c index e95d837b..dd52ba3d 100644 --- a/expat/tests/basic_tests.c +++ b/expat/tests/basic_tests.c @@ -5304,6 +5304,154 @@ START_TEST(test_big_tokens_take_linear_time) { } END_TEST +START_TEST(test_set_reparse_deferral) { + const char *const pre = ""; + const char *const start = ""; + char eeeeee[100]; + const int fillsize = (int)sizeof(eeeeee); + memset(eeeeee, 'e', fillsize); + + for (int enabled = 0; enabled <= 1; enabled += 1) { + set_subtest("deferral=%d", enabled); + + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + assert_true(XML_SetReparseDeferralEnabled(parser, enabled)); + + CharData storage; + CharData_Init(&storage); + XML_SetUserData(parser, &storage); + XML_SetStartElementHandler(parser, start_element_event_handler); + + enum XML_Status status; + // parse the start text + status = XML_Parse(parser, pre, 
(int)strlen(pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done + + // ..and the start of the token + status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // still just the first one + + // try to parse lots of 'e', but the token isn't finished + for (int c = 0; c < 100; ++c) { + status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one + + // end the token. + status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + + if (enabled) { + // In general, we may need to push more data to trigger a reparse attempt, + // but in this test, the data is constructed to always require it. + CharData_CheckXMLChars(&storage, XCS("d")); // or the test is incorrect + // 2x the token length should suffice; the +1 covers the start and end. 
+ for (int c = 0; c < 101; ++c) { + status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + } + } + CharData_CheckXMLChars(&storage, XCS("dx")); // the should be done + + XML_ParserFree(parser); + } +} +END_TEST + +START_TEST(test_set_reparse_deferral_on_null_parser) { + assert_true(XML_SetReparseDeferralEnabled(NULL, 0) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 1) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 10) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, 100) == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MIN) + == XML_FALSE); + assert_true(XML_SetReparseDeferralEnabled(NULL, (XML_Bool)INT_MAX) + == XML_FALSE); +} +END_TEST + +START_TEST(test_set_reparse_deferral_on_the_fly) { + const char *const pre = " token. + status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("d")); // not yet. + + // now change the heuristic setting and add *no* data + assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE)); + // we avoid isFinal=XML_TRUE, because that would force-bypass the heuristic. 
+ status = XML_Parse(parser, post, (int)strlen(post), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + CharData_CheckXMLChars(&storage, XCS("dx")); + + XML_ParserFree(parser); +} +END_TEST + +START_TEST(test_set_bad_reparse_option) { + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 2)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 3)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 99)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 127)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 128)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 129)); + assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, 255)); + assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0)); + assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1)); + XML_ParserFree(parser); +} +END_TEST + void make_basic_test_case(Suite *s) { TCase *tc_basic = tcase_create("basic tests"); @@ -5545,4 +5693,8 @@ make_basic_test_case(Suite *s) { test_pool_integrity_with_unfinished_attr); tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend); tcase_add_test(tc_basic, test_big_tokens_take_linear_time); + tcase_add_test(tc_basic, test_set_reparse_deferral); + tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser); + tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly); + tcase_add_test(tc_basic, test_set_bad_reparse_option); }