From cd03fed762a7caaf3334be65387cd2244c2bd697 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Wed, 24 Apr 2024 10:34:11 -0400
Subject: [PATCH] Add check for unicode normalization in config (#44)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article to
handle an edge case where non-normalized output is inadvertently created
as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:
> MediaWiki applies normalization form C (NFC) to Unicode text input.
> MediaWiki doesn't apply any normalization to its output, for example
> `café` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock  |  5 +++--
 Cargo.toml  |  5 ++++-
 src/html.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 40b7ffa..ad954f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]
@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 22abbcf..3baec8f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"
@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"

+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
diff --git a/src/html.rs b/src/html.rs
index 4315a72..56d309f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }

+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });

 static HEADERS: Lazy<Selector> =
@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator) {
 }

 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();

@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }

+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
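
Note (not part of the patch): a minimal standalone sketch of the normalization
behavior the new test relies on, using the same `unicode-normalization` crate
APIs (`is_nfc` and the `UnicodeNormalization::nfc` iterator). The example
strings and variable names are illustrative only; it shows why a decomposed
section title would fail the bytewise comparison in `remove_sections` until it
is normalized to NFC:

    use unicode_normalization::{is_nfc, UnicodeNormalization};

    fn main() {
        // "café" spelled two ways: decomposed (U+0065 U+0301) vs precomposed (U+00E9).
        let decomposed = "cafe\u{0301}";
        let precomposed = "caf\u{00E9}";

        // They render identically but are different byte sequences,
        // so a simple string comparison fails.
        assert_ne!(decomposed, precomposed);
        assert!(!is_nfc(decomposed));
        assert!(is_nfc(precomposed));

        // Applying NFC (what MediaWiki applies to its input) makes them compare equal.
        let normalized: String = decomposed.nfc().collect();
        assert_eq!(normalized, precomposed);
    }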