From 7d453d5e633b5b92810811cb4eacba9309ea172a Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Wed, 4 Oct 2023 16:20:50 -0400 Subject: [PATCH] Reorganize html module Signed-off-by: Evan Lloyd New-Schmidt --- benches/html_processing.rs | 4 +-- src/get_articles.rs | 8 +++--- src/html.rs | 50 ++++++++++++++++++++++++++------------ src/main.rs | 2 +- tests/html.rs | 15 ++++++------ 5 files changed, 49 insertions(+), 30 deletions(-) diff --git a/benches/html_processing.rs b/benches/html_processing.rs index 0e4f3e0..1d5b73e 100644 --- a/benches/html_processing.rs +++ b/benches/html_processing.rs @@ -12,9 +12,9 @@ fn process_crimean_mountains(b: &mut Bencher) { let text = include_str!("../tests/data/Q4185820-en/original.html"); // process lazy statics beforehand - black_box(html::simplify(text, "en")); + black_box(html::process_str(text, "en").unwrap()); b.iter(|| { - black_box(html::simplify(text, "en")); + black_box(html::process_str(text, "en").unwrap()); }); } diff --git a/src/get_articles.rs b/src/get_articles.rs index ab67b2f..b77bcad 100644 --- a/src/get_articles.rs +++ b/src/get_articles.rs @@ -270,8 +270,10 @@ fn write( redirects: impl IntoIterator, simplify: bool, ) -> anyhow::Result<()> { - let html = if simplify { - match html::simplify(&page.article_body.html, &page.in_language.identifier) { + let html = if !simplify { + page.article_body.html.to_string() + } else { + match html::process_str(&page.article_body.html, &page.in_language.identifier) { Ok(html) => html, Err(HtmlError::Panic(msg)) => { // Write original article text to disk @@ -293,8 +295,6 @@ fn write( } Err(e) => bail!(e), } - } else { - page.article_body.html.to_string() }; let article_dir = create_article_dir(&base, page, redirects)?; diff --git a/src/html.rs b/src/html.rs index 4e3853f..4315a72 100644 --- a/src/html.rs +++ b/src/html.rs @@ -86,17 +86,24 @@ static ELEMENT_DENY_LIST: Lazy = Lazy::new(|| { .unwrap() }); -pub fn simplify(html: &str, lang: &str) -> Result { +/// Convenience wrapper around [[process]]. +pub fn process_str(html: &str, lang: &str) -> Result { + let document = Html::parse_document(html); + let document = process(document, lang)?; + Ok(document.html()) +} + +/// Simplify an article, checking for bad pages and failures. +pub fn process(mut document: Html, lang: &str) -> Result { panic::catch_unwind(|| { - let mut document = Html::parse_document(html); if let Some(redirect) = detect_redirect(&document) { return Err(HtmlError::Redirect(redirect.to_owned())); } - simplify_html(&mut document, lang); + simplify(&mut document, lang); if !has_text(&document) { return Err(HtmlError::NoText); } - Ok(document.html()) + Ok(document) }) .map_err(PanicMsg::new)? } @@ -144,6 +151,7 @@ pub fn detect_lang(document: &Html) -> Option { }) } +/// Check if the html contains any non-whitespace text nodes. pub fn has_text(document: &Html) -> bool { if let Some(root) = ElementRef::wrap(document.tree.root()) { !is_empty_or_whitespace(&root) @@ -157,7 +165,16 @@ pub fn has_text(document: &Html) -> bool { } } -pub fn simplify_html(document: &mut Html, lang: &str) { +/// Simplify an article to only basic text. +/// +/// # Panics +/// +/// This modifies the HTML tree in a way that violates some assumptions of the underlying +/// `scraper` and `ego-tree` crates and cause panics. +/// +/// If this is undesirable, see [[process]] for a higher-level wrapper that +/// handles panics and other errors. +pub fn simplify(document: &mut Html, lang: &str) { if let Some(titles) = CONFIG.sections_to_remove.get(lang) { remove_sections(document, titles); } @@ -419,6 +436,18 @@ fn expand_id(document: &mut Html, id: NodeId) { node.detach(); } +#[derive(Debug, PartialEq, thiserror::Error)] +pub enum HtmlError { + /// Processing this HTML caused a panic in an underlying library + #[error("panicked while processing html")] + Panic(#[from] PanicMsg), + #[error("page is redirect stub for {0:?}")] + Redirect(String), + #[error("page has no text after processing")] + NoText, +} + +/// Error wrapper around panic payloads that handles static and formatted messages. #[derive(Debug, PartialEq)] pub struct PanicMsg(Cow<'static, str>); @@ -450,17 +479,6 @@ impl Deref for PanicMsg { } } -#[derive(Debug, PartialEq, thiserror::Error)] -pub enum HtmlError { - /// Processing this HTML caused a panic in an underlying library - #[error("panicked while processing html")] - Panic(#[from] PanicMsg), - #[error("page is redirect stub for {0:?}")] - Redirect(String), - #[error("page has no text after processing")] - NoText, -} - #[cfg(test)] mod test { use super::*; diff --git a/src/main.rs b/src/main.rs index b28d7fb..7286ee1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -174,7 +174,7 @@ fn main() -> anyhow::Result<()> { stdin().read_to_string(&mut input)?; let start = Instant::now(); - let output = om_wikiparser::html::simplify(&input, &lang)?; + let output = om_wikiparser::html::process_str(&input, &lang)?; let stop = Instant::now(); let time = stop.duration_since(start); diff --git a/tests/html.rs b/tests/html.rs index 27b1e48..fdef3f5 100644 --- a/tests/html.rs +++ b/tests/html.rs @@ -3,14 +3,15 @@ //! To update the expected output, run the test again with the env variable //! `UPDATE_EXPECT=1` set. //! See https://docs.rs/expect-test/ for more information. -use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError}; +use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError}; use expect_test::{expect_file, ExpectFile}; use scraper::Html; fn check(input: &str, expect: ExpectFile) { - let mut html = Html::parse_document(input); - simplify_html(&mut html, "en"); + let html = Html::parse_document(input); + let lang = detect_lang(&html).unwrap(); + let html = process(html, &lang).unwrap(); let processed = pretty_print(&html); expect.assert_eq(&processed); @@ -35,13 +36,13 @@ fn simplify_thoor_ballylee() { #[test] fn not_redirect_crimean_mountains() { let article = include_str!("./data/Q748282-en/original.html"); - assert!(simplify(article, "en").is_ok()); + assert!(process_str(article, "en").is_ok()); } #[test] fn not_redirect_thoor_ballylee() { let article = include_str!("./data/Q4185820-en/original.html"); - assert!(simplify(article, "en").is_ok()); + assert!(process_str(article, "en").is_ok()); } #[test] @@ -49,12 +50,12 @@ fn is_redirect_abdalcık_aşkale() { let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html"); assert_eq!( Err(HtmlError::Redirect("Aşkale".into())), - simplify(article, "en") + process_str(article, "en") ); } #[test] fn is_empty_bahnstrecke_bassum_herford() { let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html"); - assert_eq!(Err(HtmlError::NoText), simplify(article, "en")); + assert_eq!(Err(HtmlError::NoText), process_str(article, "en")); }