Add check for unicode normalization in config (#44)
This ensures that the config sections match Wikipedia's Unicode normalization. We could also normalize every section in every article to handle an edge case where non-normalized output is inadvertently created as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:

> MediaWiki applies normalization form C (NFC) to Unicode text input.
> MediaWiki doesn't apply any normalization to its output, for example
> `cafe<nowiki/>́` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
parent 19d9f2c42a
commit cd03fed762

3 changed files with 57 additions and 8 deletions
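For illustration, a minimal sketch of the mismatch described above (not part of the commit), using the same `unicode-normalization` crate that this change adds as a dev-dependency:

```rust
use unicode_normalization::{is_nfc, UnicodeNormalization};

fn main() {
    // "café" spelled two ways: decomposed 'e' + combining acute (U+0065 U+0301),
    // and the precomposed character (U+00E9).
    let decomposed = "cafe\u{301}";
    let precomposed = "caf\u{E9}";

    // The two strings render identically but differ bytewise,
    // so a simple byte comparison treats them as distinct titles.
    assert_ne!(decomposed, precomposed);

    // MediaWiki normalizes its input to NFC, so config entries must be NFC too.
    assert!(!is_nfc(decomposed));
    assert!(is_nfc(precomposed));

    // Applying NFC reconciles the two forms.
    assert_eq!(decomposed.nfc().collect::<String>(), precomposed);
}
```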
Cargo.lock (generated, 5 changes)

```diff
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]

@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
```
Cargo.toml (5 changes)

```diff
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"

@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"
 
+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
```
src/html.rs (55 changes)

```diff
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =

@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
 

@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
```
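The new check can be run on its own with `cargo test static_config_sections_are_normalized`; on failure, the `eprintln!` output lists each offending section alongside its NFC form.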