Handle simplification panics
I've tested manually and it:
- handles panics with a static message or formatted arguments
- logs an error instead of exiting (backtraces are still printed)
- writes any panic-causing html to an `errors/` subdirectory

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
3de06a3209
commit
33174511dd
3 changed files with 77 additions and 9 deletions
|
@ -8,7 +8,7 @@ use std::{
|
|||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use om_wikiparser::{
|
||||
html,
|
||||
html::{self, HtmlError},
|
||||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
|
||||
};
|
||||
|
||||
|
@ -269,7 +269,7 @@ fn write(
|
|||
redirects: impl IntoIterator<Item = Title>,
|
||||
simplify: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let article_dir = create_article_dir(base, page, redirects)?;
|
||||
let article_dir = create_article_dir(&base, page, redirects)?;
|
||||
|
||||
// Write html to determined file.
|
||||
let mut filename = article_dir;
|
||||
|
@ -283,7 +283,27 @@ fn write(
|
|||
}
|
||||
|
||||
let html = if simplify {
|
||||
html::simplify(&page.article_body.html, &page.in_language.identifier)
|
||||
match html::simplify(&page.article_body.html, &page.in_language.identifier) {
|
||||
Ok(html) => html,
|
||||
Err(HtmlError::Panic(msg)) => {
|
||||
// Write original article text to disk
|
||||
let mut error_file = base.as_ref().to_path_buf();
|
||||
error_file.push("errors");
|
||||
if !error_file.exists() {
|
||||
fs::create_dir(&error_file).context("creating error directory")?;
|
||||
}
|
||||
error_file.push(page.name.replace('/', "%2F"));
|
||||
error_file.set_extension("html");
|
||||
|
||||
fs::write(&error_file, &page.article_body.html).context("writing error file")?;
|
||||
|
||||
if !msg.is_empty() {
|
||||
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
|
||||
} else {
|
||||
bail!("panic occurred while processing html (saved to {error_file:?})");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
page.article_body.html.to_string()
|
||||
};
|
||||
|
|
58
src/html.rs
58
src/html.rs
|
@ -1,4 +1,11 @@
|
|||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::{
|
||||
any::Any,
|
||||
borrow::Cow,
|
||||
collections::{BTreeMap, BTreeSet},
|
||||
fmt::Display,
|
||||
ops::Deref,
|
||||
panic,
|
||||
};
|
||||
|
||||
use ego_tree::NodeId;
|
||||
use markup5ever::{LocalName, Namespace, QualName};
|
||||
|
@ -78,10 +85,13 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
|
|||
.unwrap()
|
||||
});
|
||||
|
||||
pub fn simplify(html: &str, lang: &str) -> String {
|
||||
let mut document = Html::parse_document(html);
|
||||
simplify_html(&mut document, lang);
|
||||
document.html()
|
||||
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
|
||||
panic::catch_unwind(|| {
|
||||
let mut document = Html::parse_document(html);
|
||||
simplify_html(&mut document, lang);
|
||||
Ok(document.html())
|
||||
})
|
||||
.map_err(PanicMsg::new)?
|
||||
}
|
||||
|
||||
pub fn simplify_html(document: &mut Html, lang: &str) {
|
||||
|
@ -329,6 +339,44 @@ fn expand_id(document: &mut Html, id: NodeId) {
|
|||
node.detach();
|
||||
}
|
||||
|
||||
/// Message text recovered from a panic payload.
///
/// The message is empty when the payload was neither a `&'static str` nor a `String`.
#[derive(Debug)]
pub struct PanicMsg(Cow<'static, str>);

impl PanicMsg {
    /// Builds a `PanicMsg` from a boxed panic payload.
    ///
    /// A `String` payload is moved into the message, a `&'static str` payload is
    /// borrowed, and any other payload type yields an empty message.
    pub fn new(payload: Box<dyn Any + Send + 'static>) -> Self {
        let text: Cow<'static, str> = match payload.downcast::<String>() {
            Ok(owned) => Cow::Owned(*owned),
            // Not a `String`: the box is handed back so the static-str case can be tried.
            Err(other) => match other.downcast_ref::<&'static str>() {
                Some(literal) => Cow::Borrowed(*literal),
                None => Cow::default(),
            },
        };

        Self(text)
    }
}

impl Display for PanicMsg {
    /// Writes the message text verbatim, ignoring any width or padding flags.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for PanicMsg {}

impl Deref for PanicMsg {
    type Target = str;

    /// Exposes the message as a plain string slice.
    fn deref(&self) -> &str {
        &*self.0
    }
}
|
||||
|
||||
/// Errors reported while processing article HTML.
#[derive(Debug, thiserror::Error)]
|
||||
pub enum HtmlError {
|
||||
/// Processing this HTML caused a panic in an underlying library.
///
/// The wrapped [`PanicMsg`] holds the text recovered from the panic payload;
/// it may be empty. The `#[from]` attribute lets a `PanicMsg` be converted
/// into this variant with `?`.
|
||||
#[error("panicked while processing html")]
|
||||
Panic(#[from] PanicMsg),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
|
|
@ -102,7 +102,7 @@ fn main() -> anyhow::Result<()> {
|
|||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let start = Instant::now();
|
||||
let output = om_wikiparser::html::simplify(&input, &lang);
|
||||
let output = om_wikiparser::html::simplify(&input, &lang)?;
|
||||
let stop = Instant::now();
|
||||
let time = stop.duration_since(start);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue