Reorganize html module

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-10-04 16:20:50 -04:00 committed by Evan Lloyd New-Schmidt
parent b5f0b22f7a
commit 7d453d5e63
5 changed files with 49 additions and 30 deletions

View file

@ -12,9 +12,9 @@ fn process_crimean_mountains(b: &mut Bencher) {
let text = include_str!("../tests/data/Q4185820-en/original.html"); let text = include_str!("../tests/data/Q4185820-en/original.html");
// process lazy statics beforehand // process lazy statics beforehand
black_box(html::simplify(text, "en")); black_box(html::process_str(text, "en").unwrap());
b.iter(|| { b.iter(|| {
black_box(html::simplify(text, "en")); black_box(html::process_str(text, "en").unwrap());
}); });
} }

View file

@ -270,8 +270,10 @@ fn write(
redirects: impl IntoIterator<Item = Title>, redirects: impl IntoIterator<Item = Title>,
simplify: bool, simplify: bool,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let html = if simplify { let html = if !simplify {
match html::simplify(&page.article_body.html, &page.in_language.identifier) { page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html, Ok(html) => html,
Err(HtmlError::Panic(msg)) => { Err(HtmlError::Panic(msg)) => {
// Write original article text to disk // Write original article text to disk
@ -293,8 +295,6 @@ fn write(
} }
Err(e) => bail!(e), Err(e) => bail!(e),
} }
} else {
page.article_body.html.to_string()
}; };
let article_dir = create_article_dir(&base, page, redirects)?; let article_dir = create_article_dir(&base, page, redirects)?;

View file

@ -86,17 +86,24 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
.unwrap() .unwrap()
}); });
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> { /// Convenience wrapper around [`process`].
pub fn process_str(html: &str, lang: &str) -> Result<String, HtmlError> {
// Parse the raw markup into a document tree that `process` can consume.
let document = Html::parse_document(html);
// Any `HtmlError` reported by `process` is propagated to the caller via `?`.
let document = process(document, lang)?;
// Serialize the processed tree back into a string for the caller.
Ok(document.html())
}
/// Simplify an article, checking for bad pages and failures.
pub fn process(mut document: Html, lang: &str) -> Result<Html, HtmlError> {
panic::catch_unwind(|| { panic::catch_unwind(|| {
let mut document = Html::parse_document(html);
if let Some(redirect) = detect_redirect(&document) { if let Some(redirect) = detect_redirect(&document) {
return Err(HtmlError::Redirect(redirect.to_owned())); return Err(HtmlError::Redirect(redirect.to_owned()));
} }
simplify_html(&mut document, lang); simplify(&mut document, lang);
if !has_text(&document) { if !has_text(&document) {
return Err(HtmlError::NoText); return Err(HtmlError::NoText);
} }
Ok(document.html()) Ok(document)
}) })
.map_err(PanicMsg::new)? .map_err(PanicMsg::new)?
} }
@ -144,6 +151,7 @@ pub fn detect_lang(document: &Html) -> Option<String> {
}) })
} }
/// Check if the html contains any non-whitespace text nodes.
pub fn has_text(document: &Html) -> bool { pub fn has_text(document: &Html) -> bool {
if let Some(root) = ElementRef::wrap(document.tree.root()) { if let Some(root) = ElementRef::wrap(document.tree.root()) {
!is_empty_or_whitespace(&root) !is_empty_or_whitespace(&root)
@ -157,7 +165,16 @@ pub fn has_text(document: &Html) -> bool {
} }
} }
pub fn simplify_html(document: &mut Html, lang: &str) { /// Simplify an article to only basic text.
///
/// # Panics
///
/// This modifies the HTML tree in a way that violates some assumptions of the underlying
/// `scraper` and `ego-tree` crates and can cause panics.
///
/// If this is undesirable, see [`process`] for a higher-level wrapper that
/// handles panics and other errors.
pub fn simplify(document: &mut Html, lang: &str) {
if let Some(titles) = CONFIG.sections_to_remove.get(lang) { if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
remove_sections(document, titles); remove_sections(document, titles);
} }
@ -419,6 +436,18 @@ fn expand_id(document: &mut Html, id: NodeId) {
node.detach(); node.detach();
} }
/// Errors reported while processing an article's HTML.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
/// A redirect was detected in the document; the payload is the detected redirect value.
#[error("page is redirect stub for {0:?}")]
Redirect(String),
/// The document had no text left once simplification finished.
#[error("page has no text after processing")]
NoText,
}
/// Error wrapper around panic payloads that handles static and formatted messages.
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub struct PanicMsg(Cow<'static, str>); pub struct PanicMsg(Cow<'static, str>);
@ -450,17 +479,6 @@ impl Deref for PanicMsg {
} }
} }
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
#[error("page is redirect stub for {0:?}")]
Redirect(String),
#[error("page has no text after processing")]
NoText,
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;

View file

@ -174,7 +174,7 @@ fn main() -> anyhow::Result<()> {
stdin().read_to_string(&mut input)?; stdin().read_to_string(&mut input)?;
let start = Instant::now(); let start = Instant::now();
let output = om_wikiparser::html::simplify(&input, &lang)?; let output = om_wikiparser::html::process_str(&input, &lang)?;
let stop = Instant::now(); let stop = Instant::now();
let time = stop.duration_since(start); let time = stop.duration_since(start);

View file

@ -3,14 +3,15 @@
//! To update the expected output, run the test again with the env variable //! To update the expected output, run the test again with the env variable
//! `UPDATE_EXPECT=1` set. //! `UPDATE_EXPECT=1` set.
//! See https://docs.rs/expect-test/ for more information. //! See https://docs.rs/expect-test/ for more information.
use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError}; use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError};
use expect_test::{expect_file, ExpectFile}; use expect_test::{expect_file, ExpectFile};
use scraper::Html; use scraper::Html;
fn check(input: &str, expect: ExpectFile) { fn check(input: &str, expect: ExpectFile) {
let mut html = Html::parse_document(input); let html = Html::parse_document(input);
simplify_html(&mut html, "en"); let lang = detect_lang(&html).unwrap();
let html = process(html, &lang).unwrap();
let processed = pretty_print(&html); let processed = pretty_print(&html);
expect.assert_eq(&processed); expect.assert_eq(&processed);
@ -35,13 +36,13 @@ fn simplify_thoor_ballylee() {
#[test] #[test]
fn not_redirect_crimean_mountains() { fn not_redirect_crimean_mountains() {
let article = include_str!("./data/Q748282-en/original.html"); let article = include_str!("./data/Q748282-en/original.html");
assert!(simplify(article, "en").is_ok()); assert!(process_str(article, "en").is_ok());
} }
#[test] #[test]
fn not_redirect_thoor_ballylee() { fn not_redirect_thoor_ballylee() {
let article = include_str!("./data/Q4185820-en/original.html"); let article = include_str!("./data/Q4185820-en/original.html");
assert!(simplify(article, "en").is_ok()); assert!(process_str(article, "en").is_ok());
} }
#[test] #[test]
@ -49,12 +50,12 @@ fn is_redirect_abdalcık_aşkale() {
let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html"); let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
assert_eq!( assert_eq!(
Err(HtmlError::Redirect("Aşkale".into())), Err(HtmlError::Redirect("Aşkale".into())),
simplify(article, "en") process_str(article, "en")
); );
} }
#[test] #[test]
fn is_empty_bahnstrecke_bassum_herford() { fn is_empty_bahnstrecke_bassum_herford() {
let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html"); let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
assert_eq!(Err(HtmlError::NoText), simplify(article, "en")); assert_eq!(Err(HtmlError::NoText), process_str(article, "en"));
} }