From 58864b4fcc1b6e22bf009ad854fcac6e5e107ad4 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Sun, 21 Apr 2024 12:11:19 -0400
Subject: [PATCH 1/2] Add check for unicode normalization in config

This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article
to handle an edge case, but I don't think that's worth it yet.

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock  |  5 +++--
 Cargo.toml  |  5 ++++-
 src/html.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 40b7ffa..ad954f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]
@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 22abbcf..3baec8f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"
@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"
 
+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
diff --git a/src/html.rs b/src/html.rs
index 4315a72..30c6001 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =
@@ -488,6 +499,36 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
-- 
2.45.3

From 62a16d2da7db12abd7d04743cf8342687330f29d Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Wed, 24 Apr 2024 10:22:42 -0400
Subject: [PATCH 2/2] Add caveats to section removal doc

Signed-off-by: Evan Lloyd New-Schmidt
---
 src/html.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/html.rs b/src/html.rs
index 30c6001..56d309f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -214,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
 
@@ -507,6 +510,7 @@ mod test {
     /// text, which this does not handle.
     ///
     /// See also:
+    /// - [super::remove_sections]
     /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
     /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
     #[test]
-- 
2.45.3
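
A possible follow-up for the edge case called out in the first commit message (section titles that templates expand into non-NFC text) would be to normalize headings at comparison time instead of, or in addition to, requiring NFC in the config. The sketch below is illustrative only: the heading_matches helper and the "Caf\u{00E9}" title are invented for the example and are not part of these patches, which deliberately keep remove_sections as a plain byte comparison.

use std::collections::BTreeSet;

use unicode_normalization::{is_nfc, UnicodeNormalization};

/// Hypothetical helper: does `heading` match one of the configured titles,
/// after normalizing the heading to NFC?
fn heading_matches(heading: &str, titles: &BTreeSet<&str>) -> bool {
    if is_nfc(heading) {
        // Fast path: already NFC (the common case), compare without allocating.
        titles.contains(heading)
    } else {
        // Slow path: normalize to NFC first, e.g. for headings expanded from
        // templates into decomposed text.
        let normalized: String = heading.nfc().collect();
        titles.contains(normalized.as_str())
    }
}

fn main() {
    // Titles stored in NFC, as the new test enforces for the config file.
    // "Caf\u{00E9}" is "Café" with a precomposed é.
    let titles = BTreeSet::from(["References", "Caf\u{00E9}"]);

    // The same title in NFD: 'e' followed by a combining acute accent.
    let decomposed = "Cafe\u{0301}";

    assert!(heading_matches("References", &titles));
    assert!(heading_matches(decomposed, &titles));
    assert!(!heading_matches("External links", &titles));
}

The cost of this approach is one String allocation per non-NFC heading; with the new test guaranteeing the config is already NFC, the fast path covers virtually every real heading, which is presumably why the patches defer this change.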