diff --git a/src/get_articles.rs b/src/get_articles.rs index 57e7526..0ece40a 100644 --- a/src/get_articles.rs +++ b/src/get_articles.rs @@ -8,7 +8,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use om_wikiparser::{ - html, + html::{self, HtmlError}, wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title}, }; @@ -269,7 +269,7 @@ fn write( redirects: impl IntoIterator, simplify: bool, ) -> anyhow::Result<()> { - let article_dir = create_article_dir(base, page, redirects)?; + let article_dir = create_article_dir(&base, page, redirects)?; // Write html to determined file. let mut filename = article_dir; @@ -283,7 +283,27 @@ fn write( } let html = if simplify { - html::simplify(&page.article_body.html, &page.in_language.identifier) + match html::simplify(&page.article_body.html, &page.in_language.identifier) { + Ok(html) => html, + Err(HtmlError::Panic(msg)) => { + // Write original article text to disk + let mut error_file = base.as_ref().to_path_buf(); + error_file.push("errors"); + if !error_file.exists() { + fs::create_dir(&error_file).context("creating error directory")?; + } + error_file.push(page.name.replace('/', "%2F")); + error_file.set_extension("html"); + + fs::write(&error_file, &page.article_body.html).context("writing error file")?; + + if !msg.is_empty() { + bail!("panic occurred while processing html (saved to {error_file:?}): {msg}"); + } else { + bail!("panic occurred while processing html (saved to {error_file:?})"); + } + } + } } else { page.article_body.html.to_string() }; diff --git a/src/html.rs b/src/html.rs index f8a219f..78fba3d 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,4 +1,11 @@ -use std::collections::{BTreeMap, BTreeSet}; +use std::{ + any::Any, + borrow::Cow, + collections::{BTreeMap, BTreeSet}, + fmt::Display, + ops::Deref, + panic, +}; use ego_tree::NodeId; use markup5ever::{LocalName, Namespace, QualName}; @@ -78,10 +85,13 @@ static ELEMENT_DENY_LIST: Lazy = Lazy::new(|| { .unwrap() }); -pub fn simplify(html: &str, lang: &str) -> String { - let mut document = Html::parse_document(html); - simplify_html(&mut document, lang); - document.html() +pub fn simplify(html: &str, lang: &str) -> Result { + panic::catch_unwind(|| { + let mut document = Html::parse_document(html); + simplify_html(&mut document, lang); + Ok(document.html()) + }) + .map_err(PanicMsg::new)? } pub fn simplify_html(document: &mut Html, lang: &str) { @@ -329,6 +339,44 @@ fn expand_id(document: &mut Html, id: NodeId) { node.detach(); } +#[derive(Debug)] +pub struct PanicMsg(Cow<'static, str>); + +impl PanicMsg { + pub fn new(payload: Box) -> Self { + let msg = if let Some(s) = payload.downcast_ref::<&str>() { + Some(Cow::Borrowed(*s)) + } else { + payload.downcast::().ok().map(|s| Cow::Owned(*s)) + }; + + Self(msg.unwrap_or_default()) + } +} + +impl Display for PanicMsg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for PanicMsg {} + +impl Deref for PanicMsg { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(Debug, thiserror::Error)] +pub enum HtmlError { + /// Processing this HTML caused a panic in an underlying library + #[error("panicked while processing html")] + Panic(#[from] PanicMsg), +} + #[cfg(test)] mod test { use super::*; diff --git a/src/main.rs b/src/main.rs index 6a26672..e88ef62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -102,7 +102,7 @@ fn main() -> anyhow::Result<()> { stdin().read_to_string(&mut input)?; let start = Instant::now(); - let output = om_wikiparser::html::simplify(&input, &lang); + let output = om_wikiparser::html::simplify(&input, &lang)?; let stop = Instant::now(); let time = stop.duration_since(start);