Add checks for article redirects and empty articles, and sniff the article language

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-09-26 14:55:19 -04:00 committed by Evan Lloyd New-Schmidt
parent e31027893e
commit 1da8ec212a
5 changed files with 106 additions and 3 deletions

View file

@ -304,6 +304,7 @@ fn write(
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
Err(e) => bail!(e),
}
} else {
page.article_body.html.to_string()

View file

@ -15,6 +15,7 @@ use serde::Deserialize;
mod pretty;
pub use pretty::pretty_print;
use url::Url;
#[derive(Debug, Deserialize)]
struct Config<'a> {
@ -88,12 +89,74 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
/// Parse `html`, reject redirect stubs, simplify the document for `lang`,
/// and return the resulting HTML as a string.
///
/// The whole pipeline runs inside `panic::catch_unwind`, so a panic raised
/// while parsing or simplifying is reported as an error instead of unwinding
/// into the caller.
///
/// # Errors
///
/// - [`HtmlError::Redirect`] if `detect_redirect` finds a redirect target.
/// - [`HtmlError::NoText`] if `has_text` reports no text after simplification.
/// - [`HtmlError::Panic`] if a panic was caught during processing.
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
    let outcome = panic::catch_unwind(|| {
        let mut document = Html::parse_document(html);

        // Redirect stubs are rejected before any simplification work is done.
        if let Some(target) = detect_redirect(&document) {
            return Err(HtmlError::Redirect(target.to_owned()));
        }

        simplify_html(&mut document, lang);

        if has_text(&document) {
            Ok(document.html())
        } else {
            Err(HtmlError::NoText)
        }
    });

    // The outer `Result` carries a caught panic; `?` converts it into
    // `HtmlError::Panic` and leaves the closure's own `Result` as the value.
    outcome.map_err(PanicMsg::new)?
}
/// Attempt to find target title of the article if it is a redirect.
///
/// Looks for the first `link[rel="mw:PageProp/redirect"]` element and returns
/// its trimmed `href` with a leading `./` removed, if present.
///
/// Returns `None` when the document contains no such element. If the element
/// exists but has no `href` attribute, an empty string is returned.
pub fn detect_redirect(document: &Html) -> Option<&str> {
    static REDIRECT: Lazy<Selector> =
        Lazy::new(|| Selector::parse(r#"link[rel="mw:PageProp/redirect"]"#).unwrap());

    let link = document.select(&REDIRECT).next()?;
    let href = link.value().attr("href").unwrap_or_default().trim();
    Some(href.strip_prefix("./").unwrap_or(href))
}
/// Attempt to find the wikipedia language of the article.
///
/// The language is taken from the first `head > base[href]` element: its
/// `href` is parsed as a URL (protocol-relative values starting with `//`
/// are given an `http:` scheme first) and the part of the domain before the
/// first `.` is returned.
///
/// Returns `None` if there is no such element, the URL does not parse, the
/// URL has no domain, or the domain contains no `.`. A domain whose remainder
/// is not `wikipedia.org` is only traced; the language is still returned.
pub fn detect_lang(document: &Html) -> Option<String> {
    static BASE: Lazy<Selector> = Lazy::new(|| Selector::parse("head > base[href]").unwrap());

    let href = document.select(&BASE).next()?.value().attr("href")?;

    // `Url::parse` needs a scheme, so protocol-relative URLs get one.
    let absolute = if href.starts_with("//") {
        format!("http:{href}")
    } else {
        href.to_owned()
    };

    let parsed = match Url::parse(&absolute) {
        Ok(parsed) => parsed,
        Err(e) => {
            trace!("Error parsing base lang url: {}", e);
            return None;
        }
    };

    let (lang, rest) = parsed.domain()?.split_once('.')?;
    if rest != "wikipedia.org" {
        trace!("Domain of base lang url is not wikipedia.org: {}", rest);
    }
    Some(lang.to_owned())
}
/// Report whether the document still contains text.
///
/// If the tree root is itself an element, the result is the negation of
/// `is_empty_or_whitespace` for that element. Otherwise the result is `true`
/// as soon as any element child of the root is not empty-or-whitespace.
pub fn has_text(document: &Html) -> bool {
    let root = document.tree.root();
    match ElementRef::wrap(root) {
        Some(element) => !is_empty_or_whitespace(&element),
        None => root
            .children()
            .filter_map(ElementRef::wrap)
            .any(|child| !is_empty_or_whitespace(&child)),
    }
}
pub fn simplify_html(document: &mut Html, lang: &str) {
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
remove_sections(document, titles);
@ -339,7 +402,7 @@ fn expand_id(document: &mut Html, id: NodeId) {
node.detach();
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// String message for a panic caught while processing HTML.
///
/// `simplify` builds this from the error side of `panic::catch_unwind`
/// via `PanicMsg::new`.
pub struct PanicMsg(Cow<'static, str>);
impl PanicMsg {
@ -370,11 +433,15 @@ impl Deref for PanicMsg {
}
}
#[derive(Debug, thiserror::Error)]
#[derive(Debug, PartialEq, thiserror::Error)]
/// Errors produced while processing an article's HTML.
pub enum HtmlError {
/// Processing this HTML caused a panic in an underlying library
#[error("panicked while processing html")]
Panic(#[from] PanicMsg),
/// The page is a redirect stub; the payload is the redirect target
/// as returned by `detect_redirect`.
#[error("page is redirect stub for {0:?}")]
Redirect(String),
/// `has_text` found no text in the document after simplification.
#[error("page has no text after processing")]
NoText,
}
#[cfg(test)]

View file

@ -0,0 +1,2 @@
<!DOCTYPE html>
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://en.wikipedia.org/wiki/Special:Redirect/revision/1018506873"><head prefix="mwr: https://en.wikipedia.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="d2663280-0a92-11ee-9251-69f636c11cde"/><meta charset="utf-8"/><meta property="mw:pageId" content="67162846"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/1015759684"/><meta property="mw:revisionSHA1" content="efbb4024f27618d5a8547d1fc093c16f7de0129f"/><meta property="dc:modified" content="2021-04-18T12:27:23.000Z"/><meta property="mw:htmlVersion" content="2.8.0"/><meta property="mw:html:version" content="2.8.0"/><link rel="dc:isVersionOf" href="//en.wikipedia.org/wiki/Abdalc%C4%B1k%2C_A%C5%9Fkale"/><base href="//en.wikipedia.org/wiki/"/><title>Abdalcık, Aşkale</title><link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=mediawiki.skinning.content.parsoid%7Cmediawiki.skinning.interface%7Csite.styles&amp;only=styles&amp;skin=vector"/><meta http-equiv="content-language" content="en"/><meta http-equiv="vary" content="Accept"/></head><body id="mwAA" lang="en" class="mw-content-ltr sitedir-ltr ltr mw-body-content parsoid-body mediawiki mw-parser-output" dir="ltr"><section data-mw-section-id="0" id="mwAQ"><link rel="mw:PageProp/redirect" href="./Aşkale" id="mwAg"/></section></body></html>

View file

@ -0,0 +1,6 @@
<!DOCTYPE html>
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://de.wikipedia.org/wiki/Special:Redirect/revision/216559427"><head prefix="mwr: https://de.wikipedia.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="82157de0-c4d8-11ed-90e2-252585de4b86"/><meta charset="utf-8"/><meta property="mw:pageId" content="11692067"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/209688171"/><meta property="mw:revisionSHA1" content="490f5886f51d04c0259915fab3d5e09681c95ac7"/><meta property="dc:modified" content="2021-10-21T14:42:03.000Z"/><meta property="mw:htmlVersion" content="2.7.0"/><meta property="mw:html:version" content="2.7.0"/><link rel="dc:isVersionOf" href="//de.wikipedia.org/wiki/Bahnstrecke_Bassum%E2%80%93Herford"/><base href="//de.wikipedia.org/wiki/"/><title>Bahnstrecke BassumHerford</title><link rel="stylesheet" href="/w/load.php?lang=de&amp;modules=mediawiki.skinning.content.parsoid%7Cmediawiki.skinning.interface%7Csite.styles&amp;only=styles&amp;skin=vector"/><meta http-equiv="content-language" content="de"/><meta http-equiv="vary" content="Accept"/></head><body id="mwAA" lang="de" class="mw-content-ltr sitedir-ltr ltr mw-body-content parsoid-body mediawiki mw-parser-output" dir="ltr"><section data-mw-section-id="0" id="mwAQ"><div class="hintergrundfarbe1 rahmenfarbe1 navigation-not-searchable navigation-not-searchable" style="border-style: solid; border-width: 1px; clear: left; margin-bottom:1em; margin-top:1em; padding: 0.25em; overflow: hidden; word-break: break-word; word-wrap: break-word; " id="Vorlage_Falschschreibung" about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"Falschschreibung","href":"./Vorlage:Falschschreibung"},"params":{"Alternativtext":{"wt":"[[Bahnstrecke BündeBassum]] oder [[Bahnstrecke HerfordKirchlengern]]"}},"i":0}}]}'><div class="noviewer nomobile" style="display: table-cell; padding-bottom: 0.2em; padding-left: 
0.25em; padding-right: 1em; padding-top: 0.2em; vertical-align: middle;" aria-hidden="true" role="presentation"><span typeof="mw:File" data-mw='{"caption":"Falschschreibung"}'><a href="./Datei:Gtk-dialog-info.svg" class="mw-file-description" title="Falschschreibung"><img alt="" resource="./Datei:Gtk-dialog-info.svg" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/25px-Gtk-dialog-info.svg.png" decoding="async" data-file-width="60" data-file-height="60" data-file-type="drawing" height="25" width="25" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/38px-Gtk-dialog-info.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/50px-Gtk-dialog-info.svg.png 2x"/></a></span></div>
<div style="display: table-cell; vertical-align: middle; width: 100%;">
<div role="navigation">
Zu diesem Stichwort ist kein Artikel vorhanden; möglicherweise ist <a rel="mw:WikiLink" href="./Bahnstrecke_BündeBassum" title="Bahnstrecke BündeBassum">Bahnstrecke BündeBassum</a> oder <a rel="mw:WikiLink" href="./Bahnstrecke_HerfordKirchlengern" title="Bahnstrecke HerfordKirchlengern">Bahnstrecke HerfordKirchlengern</a> gemeint.</div>
</div></div><link rel="mw:PageProp/Category" href="./Kategorie:Wikipedia:Falschschreibung" about="#mwt1"/><meta property="mw:PageProp/disambiguation" about="#mwt1"/><span about="#mwt1"> </span><meta property="mw:PageProp/expectedUnconnectedPage" about="#mwt1"/></section></body></html>

View file

@ -3,7 +3,7 @@
//! To update the expected output, run the test again with the env variable
//! `UPDATE_EXPECT=1` set.
//! See https://docs.rs/expect-test/ for more information.
use om_wikiparser::html::{pretty_print, simplify_html};
use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError};
use expect_test::{expect_file, ExpectFile};
use scraper::Html;
@ -31,3 +31,30 @@ fn simplify_thoor_ballylee() {
expect_file!["./data/Q4185820-en/output.html"],
);
}
/// A regular article is simplified without being rejected as a redirect
/// or as an empty page.
#[test]
fn not_redirect_crimean_mountains() {
    let html = include_str!("./data/Q748282-en/original.html");
    let result = simplify(html, "en");
    assert!(result.is_ok());
}
/// A regular article is simplified without being rejected as a redirect
/// or as an empty page.
#[test]
fn not_redirect_thoor_ballylee() {
    let html = include_str!("./data/Q4185820-en/original.html");
    let result = simplify(html, "en");
    assert!(result.is_ok());
}
/// A redirect stub is rejected with the redirect target as the error payload.
#[test]
fn is_redirect_abdalcık_aşkale() {
    let html = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
    let expected: Result<String, HtmlError> = Err(HtmlError::Redirect("Aşkale".into()));
    assert_eq!(expected, simplify(html, "en"));
}
/// A page that is left without text after simplification is rejected
/// with `HtmlError::NoText`.
#[test]
fn is_empty_bahnstrecke_bassum_herford() {
    let html = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
    let result = simplify(html, "en");
    assert_eq!(Err(HtmlError::NoText), result);
}