diff --git a/benches/html_processing.rs b/benches/html_processing.rs
index 0e4f3e0..1d5b73e 100644
--- a/benches/html_processing.rs
+++ b/benches/html_processing.rs
@@ -12,9 +12,9 @@ fn process_crimean_mountains(b: &mut Bencher) {
let text = include_str!("../tests/data/Q4185820-en/original.html");
// process lazy statics beforehand
- black_box(html::simplify(text, "en"));
+ black_box(html::process_str(text, "en").unwrap());
b.iter(|| {
- black_box(html::simplify(text, "en"));
+ black_box(html::process_str(text, "en").unwrap());
});
}
diff --git a/src/get_articles.rs b/src/get_articles.rs
index ab67b2f..b77bcad 100644
--- a/src/get_articles.rs
+++ b/src/get_articles.rs
@@ -270,8 +270,10 @@ fn write(
redirects: impl IntoIterator- ,
simplify: bool,
) -> anyhow::Result<()> {
- let html = if simplify {
- match html::simplify(&page.article_body.html, &page.in_language.identifier) {
+ let html = if !simplify {
+ page.article_body.html.to_string()
+ } else {
+ match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
@@ -293,8 +295,6 @@ fn write(
}
Err(e) => bail!(e),
}
- } else {
- page.article_body.html.to_string()
};
let article_dir = create_article_dir(&base, page, redirects)?;
diff --git a/src/html.rs b/src/html.rs
index 4e3853f..4315a72 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -86,17 +86,24 @@ static ELEMENT_DENY_LIST: Lazy = Lazy::new(|| {
.unwrap()
});
-pub fn simplify(html: &str, lang: &str) -> Result {
+/// Convenience wrapper around [[process]].
+pub fn process_str(html: &str, lang: &str) -> Result {
+ let document = Html::parse_document(html);
+ let document = process(document, lang)?;
+ Ok(document.html())
+}
+
+/// Simplify an article, checking for bad pages and failures.
+pub fn process(mut document: Html, lang: &str) -> Result {
panic::catch_unwind(|| {
- let mut document = Html::parse_document(html);
if let Some(redirect) = detect_redirect(&document) {
return Err(HtmlError::Redirect(redirect.to_owned()));
}
- simplify_html(&mut document, lang);
+ simplify(&mut document, lang);
if !has_text(&document) {
return Err(HtmlError::NoText);
}
- Ok(document.html())
+ Ok(document)
})
.map_err(PanicMsg::new)?
}
@@ -144,6 +151,7 @@ pub fn detect_lang(document: &Html) -> Option {
})
}
+/// Check if the html contains any non-whitespace text nodes.
pub fn has_text(document: &Html) -> bool {
if let Some(root) = ElementRef::wrap(document.tree.root()) {
!is_empty_or_whitespace(&root)
@@ -157,7 +165,16 @@ pub fn has_text(document: &Html) -> bool {
}
}
-pub fn simplify_html(document: &mut Html, lang: &str) {
+/// Simplify an article to only basic text.
+///
+/// # Panics
+///
+/// This modifies the HTML tree in a way that violates some assumptions of the underlying
+/// `scraper` and `ego-tree` crates and cause panics.
+///
+/// If this is undesirable, see [[process]] for a higher-level wrapper that
+/// handles panics and other errors.
+pub fn simplify(document: &mut Html, lang: &str) {
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
remove_sections(document, titles);
}
@@ -419,6 +436,18 @@ fn expand_id(document: &mut Html, id: NodeId) {
node.detach();
}
+#[derive(Debug, PartialEq, thiserror::Error)]
+pub enum HtmlError {
+ /// Processing this HTML caused a panic in an underlying library
+ #[error("panicked while processing html")]
+ Panic(#[from] PanicMsg),
+ #[error("page is redirect stub for {0:?}")]
+ Redirect(String),
+ #[error("page has no text after processing")]
+ NoText,
+}
+
+/// Error wrapper around panic payloads that handles static and formatted messages.
#[derive(Debug, PartialEq)]
pub struct PanicMsg(Cow<'static, str>);
@@ -450,17 +479,6 @@ impl Deref for PanicMsg {
}
}
-#[derive(Debug, PartialEq, thiserror::Error)]
-pub enum HtmlError {
- /// Processing this HTML caused a panic in an underlying library
- #[error("panicked while processing html")]
- Panic(#[from] PanicMsg),
- #[error("page is redirect stub for {0:?}")]
- Redirect(String),
- #[error("page has no text after processing")]
- NoText,
-}
-
#[cfg(test)]
mod test {
use super::*;
diff --git a/src/main.rs b/src/main.rs
index b28d7fb..7286ee1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -174,7 +174,7 @@ fn main() -> anyhow::Result<()> {
stdin().read_to_string(&mut input)?;
let start = Instant::now();
- let output = om_wikiparser::html::simplify(&input, &lang)?;
+ let output = om_wikiparser::html::process_str(&input, &lang)?;
let stop = Instant::now();
let time = stop.duration_since(start);
diff --git a/tests/html.rs b/tests/html.rs
index 27b1e48..fdef3f5 100644
--- a/tests/html.rs
+++ b/tests/html.rs
@@ -3,14 +3,15 @@
//! To update the expected output, run the test again with the env variable
//! `UPDATE_EXPECT=1` set.
//! See https://docs.rs/expect-test/ for more information.
-use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError};
+use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError};
use expect_test::{expect_file, ExpectFile};
use scraper::Html;
fn check(input: &str, expect: ExpectFile) {
- let mut html = Html::parse_document(input);
- simplify_html(&mut html, "en");
+ let html = Html::parse_document(input);
+ let lang = detect_lang(&html).unwrap();
+ let html = process(html, &lang).unwrap();
let processed = pretty_print(&html);
expect.assert_eq(&processed);
@@ -35,13 +36,13 @@ fn simplify_thoor_ballylee() {
#[test]
fn not_redirect_crimean_mountains() {
let article = include_str!("./data/Q748282-en/original.html");
- assert!(simplify(article, "en").is_ok());
+ assert!(process_str(article, "en").is_ok());
}
#[test]
fn not_redirect_thoor_ballylee() {
let article = include_str!("./data/Q4185820-en/original.html");
- assert!(simplify(article, "en").is_ok());
+ assert!(process_str(article, "en").is_ok());
}
#[test]
@@ -49,12 +50,12 @@ fn is_redirect_abdalcık_aşkale() {
let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
assert_eq!(
Err(HtmlError::Redirect("Aşkale".into())),
- simplify(article, "en")
+ process_str(article, "en")
);
}
#[test]
fn is_empty_bahnstrecke_bassum_herford() {
let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
- assert_eq!(Err(HtmlError::NoText), simplify(article, "en"));
+ assert_eq!(Err(HtmlError::NoText), process_str(article, "en"));
}