Reorganize html module
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
b5f0b22f7a
commit
7d453d5e63
5 changed files with 49 additions and 30 deletions
|
@ -12,9 +12,9 @@ fn process_crimean_mountains(b: &mut Bencher) {
|
|||
let text = include_str!("../tests/data/Q4185820-en/original.html");
|
||||
|
||||
// process lazy statics beforehand
|
||||
black_box(html::simplify(text, "en"));
|
||||
black_box(html::process_str(text, "en").unwrap());
|
||||
|
||||
b.iter(|| {
|
||||
black_box(html::simplify(text, "en"));
|
||||
black_box(html::process_str(text, "en").unwrap());
|
||||
});
|
||||
}
|
||||
|
|
|
@ -270,8 +270,10 @@ fn write(
|
|||
redirects: impl IntoIterator<Item = Title>,
|
||||
simplify: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let html = if simplify {
|
||||
match html::simplify(&page.article_body.html, &page.in_language.identifier) {
|
||||
let html = if !simplify {
|
||||
page.article_body.html.to_string()
|
||||
} else {
|
||||
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
|
||||
Ok(html) => html,
|
||||
Err(HtmlError::Panic(msg)) => {
|
||||
// Write original article text to disk
|
||||
|
@ -293,8 +295,6 @@ fn write(
|
|||
}
|
||||
Err(e) => bail!(e),
|
||||
}
|
||||
} else {
|
||||
page.article_body.html.to_string()
|
||||
};
|
||||
|
||||
let article_dir = create_article_dir(&base, page, redirects)?;
|
||||
|
|
50
src/html.rs
50
src/html.rs
|
@ -86,17 +86,24 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
|
|||
.unwrap()
|
||||
});
|
||||
|
||||
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
|
||||
/// Convenience wrapper around [[process]].
|
||||
pub fn process_str(html: &str, lang: &str) -> Result<String, HtmlError> {
|
||||
let document = Html::parse_document(html);
|
||||
let document = process(document, lang)?;
|
||||
Ok(document.html())
|
||||
}
|
||||
|
||||
/// Simplify an article, checking for bad pages and failures.
|
||||
pub fn process(mut document: Html, lang: &str) -> Result<Html, HtmlError> {
|
||||
panic::catch_unwind(|| {
|
||||
let mut document = Html::parse_document(html);
|
||||
if let Some(redirect) = detect_redirect(&document) {
|
||||
return Err(HtmlError::Redirect(redirect.to_owned()));
|
||||
}
|
||||
simplify_html(&mut document, lang);
|
||||
simplify(&mut document, lang);
|
||||
if !has_text(&document) {
|
||||
return Err(HtmlError::NoText);
|
||||
}
|
||||
Ok(document.html())
|
||||
Ok(document)
|
||||
})
|
||||
.map_err(PanicMsg::new)?
|
||||
}
|
||||
|
@ -144,6 +151,7 @@ pub fn detect_lang(document: &Html) -> Option<String> {
|
|||
})
|
||||
}
|
||||
|
||||
/// Check if the html contains any non-whitespace text nodes.
|
||||
pub fn has_text(document: &Html) -> bool {
|
||||
if let Some(root) = ElementRef::wrap(document.tree.root()) {
|
||||
!is_empty_or_whitespace(&root)
|
||||
|
@ -157,7 +165,16 @@ pub fn has_text(document: &Html) -> bool {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn simplify_html(document: &mut Html, lang: &str) {
|
||||
/// Simplify an article to only basic text.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This modifies the HTML tree in a way that violates some assumptions of the underlying
|
||||
/// `scraper` and `ego-tree` crates and cause panics.
|
||||
///
|
||||
/// If this is undesirable, see [[process]] for a higher-level wrapper that
|
||||
/// handles panics and other errors.
|
||||
pub fn simplify(document: &mut Html, lang: &str) {
|
||||
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
|
||||
remove_sections(document, titles);
|
||||
}
|
||||
|
@ -419,6 +436,18 @@ fn expand_id(document: &mut Html, id: NodeId) {
|
|||
node.detach();
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
pub enum HtmlError {
|
||||
/// Processing this HTML caused a panic in an underlying library
|
||||
#[error("panicked while processing html")]
|
||||
Panic(#[from] PanicMsg),
|
||||
#[error("page is redirect stub for {0:?}")]
|
||||
Redirect(String),
|
||||
#[error("page has no text after processing")]
|
||||
NoText,
|
||||
}
|
||||
|
||||
/// Error wrapper around panic payloads that handles static and formatted messages.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PanicMsg(Cow<'static, str>);
|
||||
|
||||
|
@ -450,17 +479,6 @@ impl Deref for PanicMsg {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
pub enum HtmlError {
|
||||
/// Processing this HTML caused a panic in an underlying library
|
||||
#[error("panicked while processing html")]
|
||||
Panic(#[from] PanicMsg),
|
||||
#[error("page is redirect stub for {0:?}")]
|
||||
Redirect(String),
|
||||
#[error("page has no text after processing")]
|
||||
NoText,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
|
|
@ -174,7 +174,7 @@ fn main() -> anyhow::Result<()> {
|
|||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
let start = Instant::now();
|
||||
let output = om_wikiparser::html::simplify(&input, &lang)?;
|
||||
let output = om_wikiparser::html::process_str(&input, &lang)?;
|
||||
let stop = Instant::now();
|
||||
let time = stop.duration_since(start);
|
||||
|
||||
|
|
|
@ -3,14 +3,15 @@
|
|||
//! To update the expected output, run the test again with the env variable
|
||||
//! `UPDATE_EXPECT=1` set.
|
||||
//! See https://docs.rs/expect-test/ for more information.
|
||||
use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError};
|
||||
use om_wikiparser::html::{detect_lang, pretty_print, process, process_str, HtmlError};
|
||||
|
||||
use expect_test::{expect_file, ExpectFile};
|
||||
use scraper::Html;
|
||||
|
||||
fn check(input: &str, expect: ExpectFile) {
|
||||
let mut html = Html::parse_document(input);
|
||||
simplify_html(&mut html, "en");
|
||||
let html = Html::parse_document(input);
|
||||
let lang = detect_lang(&html).unwrap();
|
||||
let html = process(html, &lang).unwrap();
|
||||
let processed = pretty_print(&html);
|
||||
|
||||
expect.assert_eq(&processed);
|
||||
|
@ -35,13 +36,13 @@ fn simplify_thoor_ballylee() {
|
|||
#[test]
|
||||
fn not_redirect_crimean_mountains() {
|
||||
let article = include_str!("./data/Q748282-en/original.html");
|
||||
assert!(simplify(article, "en").is_ok());
|
||||
assert!(process_str(article, "en").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_redirect_thoor_ballylee() {
|
||||
let article = include_str!("./data/Q4185820-en/original.html");
|
||||
assert!(simplify(article, "en").is_ok());
|
||||
assert!(process_str(article, "en").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -49,12 +50,12 @@ fn is_redirect_abdalcık_aşkale() {
|
|||
let article = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
|
||||
assert_eq!(
|
||||
Err(HtmlError::Redirect("Aşkale".into())),
|
||||
simplify(article, "en")
|
||||
process_str(article, "en")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_empty_bahnstrecke_bassum_herford() {
|
||||
let article = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
|
||||
assert_eq!(Err(HtmlError::NoText), simplify(article, "en"));
|
||||
assert_eq!(Err(HtmlError::NoText), process_str(article, "en"));
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue