Handle simplification panics

I've tested manually and it:
- handles panics with a static message or formatted arguments
- logs an error instead of exiting (backtraces are still printed)
- writes any panic-causing html to an `errors/` subdirectory

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-09-03 22:29:52 -04:00 committed by Evan Lloyd New-Schmidt
parent 3de06a3209
commit 33174511dd
3 changed files with 77 additions and 9 deletions

View file

@ -8,7 +8,7 @@ use std::{
use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
html,
html::{self, HtmlError},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
};
@ -269,7 +269,7 @@ fn write(
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
) -> anyhow::Result<()> {
let article_dir = create_article_dir(base, page, redirects)?;
let article_dir = create_article_dir(&base, page, redirects)?;
// Write html to determined file.
let mut filename = article_dir;
@ -283,7 +283,27 @@ fn write(
}
let html = if simplify {
html::simplify(&page.article_body.html, &page.in_language.identifier)
match html::simplify(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
let mut error_file = base.as_ref().to_path_buf();
error_file.push("errors");
if !error_file.exists() {
fs::create_dir(&error_file).context("creating error directory")?;
}
error_file.push(page.name.replace('/', "%2F"));
error_file.set_extension("html");
fs::write(&error_file, &page.article_body.html).context("writing error file")?;
if !msg.is_empty() {
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
} else {
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
}
} else {
page.article_body.html.to_string()
};

View file

@ -1,4 +1,11 @@
use std::collections::{BTreeMap, BTreeSet};
use std::{
any::Any,
borrow::Cow,
collections::{BTreeMap, BTreeSet},
fmt::Display,
ops::Deref,
panic,
};
use ego_tree::NodeId;
use markup5ever::{LocalName, Namespace, QualName};
@ -78,10 +85,13 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
.unwrap()
});
pub fn simplify(html: &str, lang: &str) -> String {
let mut document = Html::parse_document(html);
simplify_html(&mut document, lang);
document.html()
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
panic::catch_unwind(|| {
let mut document = Html::parse_document(html);
simplify_html(&mut document, lang);
Ok(document.html())
})
.map_err(PanicMsg::new)?
}
pub fn simplify_html(document: &mut Html, lang: &str) {
@ -329,6 +339,44 @@ fn expand_id(document: &mut Html, id: NodeId) {
node.detach();
}
/// Textual message recovered from a panic payload.
///
/// The message is empty when the payload did not carry a string.
#[derive(Debug)]
pub struct PanicMsg(Cow<'static, str>);

impl PanicMsg {
    /// Builds a message from a boxed panic payload.
    ///
    /// A `String` payload is moved out of its box, a `&'static str` payload is
    /// borrowed, and any other payload type yields an empty message.
    pub fn new(payload: Box<dyn Any + Send + 'static>) -> Self {
        let text: Cow<'static, str> = match payload.downcast::<String>() {
            Ok(owned) => Cow::Owned(*owned),
            // Not a `String`: the failed downcast hands the payload back.
            Err(other) => match other.downcast_ref::<&'static str>() {
                Some(literal) => Cow::Borrowed(*literal),
                None => Cow::default(),
            },
        };
        Self(text)
    }
}
impl Display for PanicMsg {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
// Empty impl: the trait's default methods are used as-is.
// `HtmlError::Panic` wraps this type with `#[from]`.
impl std::error::Error for PanicMsg {}
impl Deref for PanicMsg {
type Target = str;
fn deref(&self) -> &Self::Target {
&self.0
}
}
/// Error type returned by `simplify`.
#[derive(Debug, thiserror::Error)]
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library.
///
/// The wrapped [`PanicMsg`] holds the message recovered from the panic
/// payload; it is empty when the payload carried no string.
// `#[from]` is what allows `simplify` to apply `?` to a `PanicMsg` error.
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
}
#[cfg(test)]
mod test {
use super::*;

View file

@ -102,7 +102,7 @@ fn main() -> anyhow::Result<()> {
stdin().read_to_string(&mut input)?;
let start = Instant::now();
let output = om_wikiparser::html::simplify(&input, &lang);
let output = om_wikiparser::html::simplify(&input, &lang)?;
let stop = Instant::now();
let time = stop.duration_since(start);