From 58864b4fcc1b6e22bf009ad854fcac6e5e107ad4 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Sun, 21 Apr 2024 12:11:19 -0400
Subject: [PATCH 1/2] Add check for unicode normalization in config

This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article
to handle an edge case, but I don't think that's worth it yet.

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock  |  5 +++--
 Cargo.toml  |  5 ++++-
 src/html.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 40b7ffa..ad954f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -719,6 +719,7 @@ dependencies = [
  "thiserror",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
  "url",
  "urlencoding",
 ]
@@ -1478,9 +1479,9 @@ checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 22abbcf..3baec8f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 csv = "1.2.2"
 ego-tree = "0.6.2"
-expect-test = "1.4.1"
 html5ever = "0.26.0"
 log = "0.4.18"
 markup5ever = "0.11.0"
@@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
 url = "2.3.1"
 urlencoding = "2.1.2"
 
+[dev-dependencies]
+expect-test = "1.4.1"
+unicode-normalization = "0.1.23"
+
 [profile.release]
 overflow-checks = true
 lto = true
diff --git a/src/html.rs b/src/html.rs
index 4315a72..30c6001 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =
@@ -488,6 +499,36 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
-- 
2.45.3

From 62a16d2da7db12abd7d04743cf8342687330f29d Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Wed, 24 Apr 2024 10:22:42 -0400
Subject: [PATCH 2/2] Add caveats to section removal doc

Signed-off-by: Evan Lloyd New-Schmidt
---
 src/html.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/html.rs b/src/html.rs
index 30c6001..56d309f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -214,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
 
@@ -507,6 +510,7 @@ mod test {
     /// text, which this does not handle.
     ///
     /// See also:
+    /// - [super::remove_sections]
     /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
     /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
     #[test]
-- 
2.45.3
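
A possible follow-up for the edge case called out in the first commit message (section titles that templates expand into non-NFC text) would be to normalize headings at comparison time instead of, or in addition to, requiring NFC in the config. The sketch below is illustrative only: the heading_matches helper and the "Caf\u{00E9}" title are invented for the example and are not part of these patches, which deliberately keep remove_sections as a plain byte comparison.

use std::collections::BTreeSet;

use unicode_normalization::{is_nfc, UnicodeNormalization};

/// Hypothetical helper: does `heading` match one of the configured titles,
/// after normalizing the heading to NFC?
fn heading_matches(heading: &str, titles: &BTreeSet<&str>) -> bool {
    if is_nfc(heading) {
        // Fast path: already NFC (the common case), compare without allocating.
        titles.contains(heading)
    } else {
        // Slow path: normalize to NFC first, e.g. for headings expanded from
        // templates into decomposed text.
        let normalized: String = heading.nfc().collect();
        titles.contains(normalized.as_str())
    }
}

fn main() {
    // Titles stored in NFC, as the new test enforces for the config file.
    // "Caf\u{00E9}" is "Café" with a precomposed é.
    let titles = BTreeSet::from(["References", "Caf\u{00E9}"]);

    // The same title in NFD: 'e' followed by a combining acute accent.
    let decomposed = "Cafe\u{0301}";

    assert!(heading_matches("References", &titles));
    assert!(heading_matches(decomposed, &titles));
    assert!(!heading_matches("External links", &titles));
}

The cost of this approach is one String allocation per non-NFC heading; with the new test guaranteeing the config is already NFC, the fast path covers virtually every real heading, which is presumably why the patches defer this change.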