From cd03fed762a7caaf3334be65387cd2244c2bd697 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Wed, 24 Apr 2024 10:34:11 -0400
Subject: [PATCH] Add check for unicode normalization in config (#44)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article to
handle an edge case where non-normalized output is inadvertently created
as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:
> MediaWiki applies normalization form C (NFC) to Unicode text input.
> MediaWiki doesn't apply any normalization to its output, for example
> `café` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock  |  5 +++--
 Cargo.toml  |  5 ++++-
 src/html.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 40b7ffa..ad954f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]
@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 22abbcf..3baec8f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"
@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"

+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
diff --git a/src/html.rs b/src/html.rs
index 4315a72..56d309f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }

+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });

 static HEADERS: Lazy<Selector> =
@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator) {
 }

 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();

@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }

+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
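
Note (not part of the patch): a minimal standalone sketch of the normalization
behavior the new test relies on, using the same `unicode-normalization` crate
APIs (`is_nfc` and the `UnicodeNormalization::nfc` iterator). The example
strings and variable names are illustrative only; it shows why a decomposed
section title would fail the bytewise comparison in `remove_sections` until it
is normalized to NFC:

    use unicode_normalization::{is_nfc, UnicodeNormalization};

    fn main() {
        // "café" spelled two ways: decomposed (U+0065 U+0301) vs precomposed (U+00E9).
        let decomposed = "cafe\u{0301}";
        let precomposed = "caf\u{00E9}";

        // They render identically but are different byte sequences,
        // so a simple string comparison fails.
        assert_ne!(decomposed, precomposed);
        assert!(!is_nfc(decomposed));
        assert!(is_nfc(precomposed));

        // Applying NFC (what MediaWiki applies to its input) makes them compare equal.
        let normalized: String = decomposed.nfc().collect();
        assert_eq!(normalized, precomposed);
    }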