Reorganize html module

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-10-04 16:20:50 -04:00 committed by Evan Lloyd New-Schmidt
parent b5f0b22f7a
commit 7d453d5e63
5 changed files with 49 additions and 30 deletions

View file

@ -12,9 +12,9 @@ fn process_crimean_mountains(b: &mut Bencher) {
let text = include_str!("../tests/data/Q4185820-en/original.html");
// process lazy statics beforehand
black_box(html::simplify(text, "en"));
black_box(html::process_str(text, "en").unwrap());
b.iter(|| {
black_box(html::simplify(text, "en"));
black_box(html::process_str(text, "en").unwrap());
});
}

View file

@ -270,8 +270,10 @@ fn write(
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
) -> anyhow::Result<()> {
let html = if simplify {
match html::simplify(&page.article_body.html, &page.in_language.identifier) {
let html = if !simplify {
page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
@ -293,8 +295,6 @@ fn write(
}
Err(e) => bail!(e),
}
} else {
page.article_body.html.to_string()
};
let article_dir = create_article_dir(&base, page, redirects)?;

View file

@ -86,17 +86,24 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
.unwrap()
});
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
/// Convenience wrapper around [[process]].
pub fn process_str(html: &str, lang: &str) -> Result<String, HtmlError> {
let document = Html::parse_document(html);
let document = process(document, lang)?;
Ok(document.html())
}
/// Simplify an article, checking for bad pages and failures.
pub fn process(mut document: Html, lang: &str) -> Result<Html, HtmlError> {
panic::catch_unwind(|| {
let mut document = Html::parse_document(html);
if let Some(redirect) = detect_redirect(&document) {
return Err(HtmlError::Redirect(redirect.to_owned()));
}
simplify_html(&mut document, lang);
simplify(&mut document, lang);
if !has_text(&document) {
return Err(HtmlError::NoText);
}
Ok(document.html())
Ok(document)
})
.map_err(PanicMsg::new)?
}
@ -144,6 +151,7 @@ pub fn detect_lang(document: &Html) -> Option<String> {
})
}
/// Check if the html contains any non-whitespace text nodes.
pub fn has_text(document: &Html) -> bool {
if let Some(root) = ElementRef::wrap(document.tree.root()) {
!is_empty_or_whitespace(&root)
@ -157,7 +165,16 @@ pub fn has_text(document: &Html) -> bool {
}
}
pub fn simplify_html(document: &mut Html, lang: &str) {
/// Simplify an article to only basic text.
///
/// # Panics
///
/// This modifies the HTML tree in a way that violates some assumptions of the underlying
/// `scraper` and `ego-tree` crates and cause panics.
///
/// If this is undesirable, see [[process]] for a higher-level wrapper that
/// handles panics and other errors.
pub fn simplify(document: &mut Html, lang: &str) {
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
remove_sections(document, titles);
}
@ -419,6 +436,18 @@ fn expand_id(document: &mut Html, id: NodeId) {
node.detach();
}
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
#[error("page is redirect stub for {0:?}")]
Redirect(String),
#[error("page has no text after processing")]
NoText,
}
/// Error wrapper around panic payloads that handles static and formatted messages.
#[derive(Debug, PartialEq)]
pub struct PanicMsg(Cow<'static, str>);
@ -450,17 +479,6 @@ impl Deref for PanicMsg {
}
}
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
#[error("page is redirect stub for {0:?}")]
Redirect(String),
#[error("page has no text after processing")]
NoText,
}
#[cfg(test)]
mod test {
use super::*;

View file

@ -174,7 +174,7 @@ fn main() -> anyhow::Result<()> {
stdin().read_to_string(&mut input)?;
let start = Instant::now();
let output = om_wikiparser::html::simplify(&input, &lang)?;
let output = om_wikiparser::html::process_str(&input, &lang)?;
let stop = Instant::now();
let time = stop.duration_since(start);

View file

@ -3,14 +3,15 @@
//! To update the expected output, run the test again with the env variable
//! `UPDATE_EXPECT=1` set.
//! See https://docs.rs/expect-test/ for more information.
use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError};
use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError};
use expect_test::{expect_file, ExpectFile};
use scraper::Html;
fn check(input: &str, expect: ExpectFile) {
let mut html = Html::parse_document(input);
simplify_html(&mut html, "en");
let html = Html::parse_document(input);
let lang = detect_lang(&html).unwrap();
let html = process(html, &lang).unwrap();
let processed = pretty_print(&html);
expect.assert_eq(&processed);
@ -35,13 +36,13 @@ fn simplify_thoor_ballylee() {
#[test]
fn not_redirect_crimean_mountains() {
let article = include_str!("./data/Q748282-en/original.html");
assert!(simplify(article, "en").is_ok());
assert!(process_str(article, "en").is_ok());
}
#[test]
fn not_redirect_thoor_ballylee() {
let article = include_str!("./data/Q4185820-en/original.html");
assert!(simplify(article, "en").is_ok());
assert!(process_str(article, "en").is_ok());
}
#[test]
@ -49,12 +50,12 @@ fn is_redirect_abdalcık_aşkale() {
let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
assert_eq!(
Err(HtmlError::Redirect("Aşkale".into())),
simplify(article, "en")
process_str(article, "en")
);
}
#[test]
fn is_empty_bahnstrecke_bassum_herford() {
let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
assert_eq!(Err(HtmlError::NoText), simplify(article, "en"));
assert_eq!(Err(HtmlError::NoText), process_str(article, "en"));
}