Move more encoding tests out of runtests.c

2025-04-07 14:09:26 +00:00 · 2022-10-28 22:04:25 +01:00 · 2022-10-28 22:04:25 +01:00 · eef3e9f4c2
commit eef3e9f4c2
parent 6933957fb6
4 changed files with 224 additions and 218 deletions
--- a/expat/tests/basic_tests.c
+++ b/expat/tests/basic_tests.c
@ -4232,6 +4232,183 @@ START_TEST(test_ext_entity_utf16_le) {
 }
 END_TEST

+/* Test little-endian UTF-16 given no explicit encoding.
+ * The existing default encoding (UTF-8) is assumed to hold without a
+ * BOM to contradict it, so the entity value will in fact provoke an
+ * error because 0x00 is not a valid XML character.  We parse the
+ * whole buffer in one go rather than feeding it in byte by byte to
+ * exercise different code paths in the initial scanning routines.
+ */
+START_TEST(test_ext_entity_utf16_unknown) {
+  const char *text = "<!DOCTYPE doc [\n"
+                     "  <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
+                     "]>\n"
+                     "<doc>&en;</doc>";
+  ExtFaults2 test_data
+      = {"a\0b\0c\0", 6, "Invalid character in entity not faulted", NULL,
+         XML_ERROR_INVALID_TOKEN};
+
+  XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter2);
+  XML_SetUserData(g_parser, &test_data);
+  expect_failure(text, XML_ERROR_EXTERNAL_ENTITY_HANDLING,
+                 "Invalid character should not have been accepted");
+}
+END_TEST
+
+/* Test not-quite-UTF-8 BOM: 0xEF 0xBB 0x80 rather than 0xEF 0xBB 0xBF */
+START_TEST(test_ext_entity_utf8_non_bom) {
+  const char *text = "<!DOCTYPE doc [\n"
+                     "  <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
+                     "]>\n"
+                     "<doc>&en;</doc>";
+  ExtTest2 test_data
+      = {"\xef\xbb\x80", /* Arabic letter DAD medial form, U+FEC0 */
+         3, NULL, NULL, EE_PARSE_NONE};
+#ifdef XML_UNICODE
+  const XML_Char *expected = XCS("\xfec0");
+#else
+  const XML_Char *expected = XCS("\xef\xbb\x80");
+#endif
+  CharData storage;
+
+  CharData_Init(&storage);
+  test_data.storage = &storage;
+  XML_SetExternalEntityRefHandler(g_parser, external_entity_loader2);
+  XML_SetUserData(g_parser, &test_data);
+  XML_SetCharacterDataHandler(g_parser, ext2_accumulate_characters);
+  if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
+      == XML_STATUS_ERROR)
+    xml_failure(g_parser);
+  CharData_CheckXMLChars(&storage, expected);
+}
+END_TEST
+
+/* Test that UTF-8 in a CDATA section is correctly passed through */
+START_TEST(test_utf8_in_cdata_section) {
+  const char *text = "<doc><![CDATA[one \xc3\xa9 two]]></doc>";
+#ifdef XML_UNICODE
+  const XML_Char *expected = XCS("one \x00e9 two");
+#else
+  const XML_Char *expected = XCS("one \xc3\xa9 two");
+#endif
+
+  run_character_check(text, expected);
+}
+END_TEST
+
+/* Test that UTF-8 next to ']' in a CDATA section is passed through */
+START_TEST(test_utf8_in_cdata_section_2) {
+  const char *text = "<doc><![CDATA[\xc3\xa9]\xc3\xa9two]]></doc>";
+#ifdef XML_UNICODE
+  const XML_Char *expected = XCS("\x00e9]\x00e9two");
+#else
+  const XML_Char *expected = XCS("\xc3\xa9]\xc3\xa9two");
+#endif
+
+  run_character_check(text, expected);
+}
+END_TEST
+
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;
+    bool goodNameStart;
+    const char *tagName;
+  };
+
+  // The idea with the tests below is this:
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+  // go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xx xxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+
+  size_t i = 0;
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    size_t j = 0;
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
+               cases[i].tagName);
+      XML_Parser parser = XML_ParserCreate(NULL);
+
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+      bool success = true;
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
+        success = false;
+      }
+      if ((status == XML_STATUS_ERROR)
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+        success = false;
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+        failCount++;
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
+
 TCase *
 make_basic_test_case(Suite *s) {
  TCase *tc_basic = tcase_create("basic tests");
@ -4432,6 +4609,11 @@ make_basic_test_case(Suite *s) {
  tcase_add_test(tc_basic, test_ext_entity_latin1_utf16be_bom2);
  tcase_add_test(tc_basic, test_ext_entity_utf16_be);
  tcase_add_test(tc_basic, test_ext_entity_utf16_le);
+  tcase_add_test(tc_basic, test_ext_entity_utf16_unknown);
+  tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
+  tcase_add_test(tc_basic, test_utf8_in_cdata_section);
+  tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
+  tcase_add_test(tc_basic, test_utf8_in_start_tags);

  return tc_basic; /* TEMPORARY: this will become a void function */
 }
--- a/expat/tests/handlers.c
+++ b/expat/tests/handlers.c
@ -1043,6 +1043,34 @@ external_entity_loader2(XML_Parser parser, const XML_Char *context,
  return XML_STATUS_OK;
 }

+int XMLCALL
+external_entity_faulter2(XML_Parser parser, const XML_Char *context,
+                         const XML_Char *base, const XML_Char *systemId,
+                         const XML_Char *publicId) {
+  ExtFaults2 *test_data = (ExtFaults2 *)XML_GetUserData(parser);
+  XML_Parser extparser;
+
+  UNUSED_P(base);
+  UNUSED_P(systemId);
+  UNUSED_P(publicId);
+  extparser = XML_ExternalEntityParserCreate(parser, context, NULL);
+  if (extparser == NULL)
+    fail("Could not create external entity parser");
+  if (test_data->encoding != NULL) {
+    if (! XML_SetEncoding(extparser, test_data->encoding))
+      fail("XML_SetEncoding() ignored for external entity");
+  }
+  if (XML_Parse(extparser, test_data->parse_text, test_data->parse_len,
+                XML_TRUE)
+      != XML_STATUS_ERROR)
+    fail(test_data->fail_text);
+  if (XML_GetErrorCode(extparser) != test_data->error)
+    xml_failure(extparser);
+
+  XML_ParserFree(extparser);
+  return XML_STATUS_ERROR;
+}
+
 /* NotStandalone handlers */

 int XMLCALL
--- a/expat/tests/handlers.h
+++ b/expat/tests/handlers.h
@ -302,6 +302,20 @@ extern int XMLCALL external_entity_loader2(XML_Parser parser,
                                           const XML_Char *systemId,
                                           const XML_Char *publicId);

+typedef struct ExtFaults2 {
+  const char *parse_text;
+  int parse_len;
+  const char *fail_text;
+  const XML_Char *encoding;
+  enum XML_Error error;
+} ExtFaults2;
+
+extern int XMLCALL external_entity_faulter2(XML_Parser parser,
+                                            const XML_Char *context,
+                                            const XML_Char *base,
+                                            const XML_Char *systemId,
+                                            const XML_Char *publicId);
+
 /* NotStandalone handlers */

 extern int XMLCALL reject_not_standalone_handler(void *userData);
--- a/expat/tests/runtests.c
+++ b/expat/tests/runtests.c
@ -74,219 +74,6 @@

 XML_Parser g_parser = NULL;

-/* Test little-endian UTF-16 given no explicit encoding.
- * The existing default encoding (UTF-8) is assumed to hold without a
- * BOM to contradict it, so the entity value will in fact provoke an
- * error because 0x00 is not a valid XML character.  We parse the
- * whole buffer in one go rather than feeding it in byte by byte to
- * exercise different code paths in the initial scanning routines.
- */
-typedef struct ExtFaults2 {
-  const char *parse_text;
-  int parse_len;
-  const char *fail_text;
-  const XML_Char *encoding;
-  enum XML_Error error;
-} ExtFaults2;
-
-static int XMLCALL
-external_entity_faulter2(XML_Parser parser, const XML_Char *context,
-                         const XML_Char *base, const XML_Char *systemId,
-                         const XML_Char *publicId) {
-  ExtFaults2 *test_data = (ExtFaults2 *)XML_GetUserData(parser);
-  XML_Parser extparser;
-
-  UNUSED_P(base);
-  UNUSED_P(systemId);
-  UNUSED_P(publicId);
-  extparser = XML_ExternalEntityParserCreate(parser, context, NULL);
-  if (extparser == NULL)
-    fail("Could not create external entity parser");
-  if (test_data->encoding != NULL) {
-    if (! XML_SetEncoding(extparser, test_data->encoding))
-      fail("XML_SetEncoding() ignored for external entity");
-  }
-  if (XML_Parse(extparser, test_data->parse_text, test_data->parse_len,
-                XML_TRUE)
-      != XML_STATUS_ERROR)
-    fail(test_data->fail_text);
-  if (XML_GetErrorCode(extparser) != test_data->error)
-    xml_failure(extparser);
-
-  XML_ParserFree(extparser);
-  return XML_STATUS_ERROR;
-}
-
-START_TEST(test_ext_entity_utf16_unknown) {
-  const char *text = "<!DOCTYPE doc [\n"
-                     "  <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
-                     "]>\n"
-                     "<doc>&en;</doc>";
-  ExtFaults2 test_data
-      = {"a\0b\0c\0", 6, "Invalid character in entity not faulted", NULL,
-         XML_ERROR_INVALID_TOKEN};
-
-  XML_SetExternalEntityRefHandler(g_parser, external_entity_faulter2);
-  XML_SetUserData(g_parser, &test_data);
-  expect_failure(text, XML_ERROR_EXTERNAL_ENTITY_HANDLING,
-                 "Invalid character should not have been accepted");
-}
-END_TEST
-
-/* Test not-quite-UTF-8 BOM (0xEF 0xBB 0xBF) */
-START_TEST(test_ext_entity_utf8_non_bom) {
-  const char *text = "<!DOCTYPE doc [\n"
-                     "  <!ENTITY en SYSTEM 'http://example.org/dummy.ent'>\n"
-                     "]>\n"
-                     "<doc>&en;</doc>";
-  ExtTest2 test_data
-      = {"\xef\xbb\x80", /* Arabic letter DAD medial form, U+FEC0 */
-         3, NULL, NULL, EE_PARSE_NONE};
-#ifdef XML_UNICODE
-  const XML_Char *expected = XCS("\xfec0");
-#else
-  const XML_Char *expected = XCS("\xef\xbb\x80");
-#endif
-  CharData storage;
-
-  CharData_Init(&storage);
-  test_data.storage = &storage;
-  XML_SetExternalEntityRefHandler(g_parser, external_entity_loader2);
-  XML_SetUserData(g_parser, &test_data);
-  XML_SetCharacterDataHandler(g_parser, ext2_accumulate_characters);
-  if (_XML_Parse_SINGLE_BYTES(g_parser, text, (int)strlen(text), XML_TRUE)
-      == XML_STATUS_ERROR)
-    xml_failure(g_parser);
-  CharData_CheckXMLChars(&storage, expected);
-}
-END_TEST
-
-/* Test that UTF-8 in a CDATA section is correctly passed through */
-START_TEST(test_utf8_in_cdata_section) {
-  const char *text = "<doc><![CDATA[one \xc3\xa9 two]]></doc>";
-#ifdef XML_UNICODE
-  const XML_Char *expected = XCS("one \x00e9 two");
-#else
-  const XML_Char *expected = XCS("one \xc3\xa9 two");
-#endif
-
-  run_character_check(text, expected);
-}
-END_TEST
-
-/* Test that little-endian UTF-16 in a CDATA section is handled */
-START_TEST(test_utf8_in_cdata_section_2) {
-  const char *text = "<doc><![CDATA[\xc3\xa9]\xc3\xa9two]]></doc>";
-#ifdef XML_UNICODE
-  const XML_Char *expected = XCS("\x00e9]\x00e9two");
-#else
-  const XML_Char *expected = XCS("\xc3\xa9]\xc3\xa9two");
-#endif
-
-  run_character_check(text, expected);
-}
-END_TEST
-
-START_TEST(test_utf8_in_start_tags) {
-  struct test_case {
-    bool goodName;
-    bool goodNameStart;
-    const char *tagName;
-  };
-
-  // The idea with the tests below is this:
-  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
-  // go to isNever and are hence not a concern.
-  //
-  // We start with a character that is a valid name character
-  // (or even name-start character, see XML 1.0r4 spec) and then we flip
-  // single bits at places where (1) the result leaves the UTF-8 encoding space
-  // and (2) we stay in the same n-byte sequence family.
-  //
-  // The flipped bits are highlighted in angle brackets in comments,
-  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
-  // the most significant bit to 1 to leave UTF-8 encoding space.
-  struct test_case cases[] = {
-      // 1-byte UTF-8: [0xxx xxxx]
-      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
-      {false, false, "\xBA"}, // [<1>011 1010]
-      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
-      {false, false, "\xB9"}, // [<1>011 1001]
-
-      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
-      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
-                                  // Arabic small waw U+06E5
-      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
-      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
-      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
-      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
-                                  // combining char U+0301
-      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
-      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
-      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
-
-      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
-      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
-                                      // Devanagari Letter A U+0905
-      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
-      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
-      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
-      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
-      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
-      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
-                                      // combining char U+0901
-      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
-      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
-      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
-      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
-      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
-  };
-  const bool atNameStart[] = {true, false};
-
-  size_t i = 0;
-  char doc[1024];
-  size_t failCount = 0;
-
-  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
-    size_t j = 0;
-    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
-      const bool expectedSuccess
-          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
-      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
-               cases[i].tagName);
-      XML_Parser parser = XML_ParserCreate(NULL);
-
-      const enum XML_Status status
-          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
-
-      bool success = true;
-      if ((status == XML_STATUS_OK) != expectedSuccess) {
-        success = false;
-      }
-      if ((status == XML_STATUS_ERROR)
-          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
-        success = false;
-      }
-
-      if (! success) {
-        fprintf(
-            stderr,
-            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
-            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
-            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
-        failCount++;
-      }
-
-      XML_ParserFree(parser);
-    }
-  }
-
-  if (failCount > 0) {
-    fail("UTF-8 regression detected");
-  }
-}
-END_TEST
-
 /* Test trailing spaces in elements are accepted */
 static void XMLCALL
 record_element_end_handler(void *userData, const XML_Char *name) {
@ -6135,11 +5922,6 @@ make_suite(void) {
  TCase *tc_accounting = tcase_create("accounting tests");
 #endif

-  tcase_add_test(tc_basic, test_ext_entity_utf16_unknown);
-  tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
-  tcase_add_test(tc_basic, test_utf8_in_cdata_section);
-  tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
-  tcase_add_test(tc_basic, test_utf8_in_start_tags);
  tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
  tcase_add_test(tc_basic, test_utf16_attribute);
  tcase_add_test(tc_basic, test_utf16_second_attr);