Add checks for article redirects, empty articles, and sniff language
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
e31027893e
commit
1da8ec212a
5 changed files with 106 additions and 3 deletions
|
@ -304,6 +304,7 @@ fn write(
|
|||
bail!("panic occurred while processing html (saved to {error_file:?})");
|
||||
}
|
||||
}
|
||||
Err(e) => bail!(e),
|
||||
}
|
||||
} else {
|
||||
page.article_body.html.to_string()
|
||||
|
|
71
src/html.rs
71
src/html.rs
|
@ -15,6 +15,7 @@ use serde::Deserialize;
|
|||
|
||||
mod pretty;
|
||||
pub use pretty::pretty_print;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Config<'a> {
|
||||
|
@ -88,12 +89,74 @@ static ELEMENT_DENY_LIST: Lazy<Selector> = Lazy::new(|| {
|
|||
pub fn simplify(html: &str, lang: &str) -> Result<String, HtmlError> {
|
||||
panic::catch_unwind(|| {
|
||||
let mut document = Html::parse_document(html);
|
||||
if let Some(redirect) = detect_redirect(&document) {
|
||||
return Err(HtmlError::Redirect(redirect.to_owned()));
|
||||
}
|
||||
simplify_html(&mut document, lang);
|
||||
if !has_text(&document) {
|
||||
return Err(HtmlError::NoText);
|
||||
}
|
||||
Ok(document.html())
|
||||
})
|
||||
.map_err(PanicMsg::new)?
|
||||
}
|
||||
|
||||
/// Attempt to find target title of the article if it is a redirect.
|
||||
pub fn detect_redirect(document: &Html) -> Option<&str> {
|
||||
static REDIRECT: Lazy<Selector> =
|
||||
Lazy::new(|| Selector::parse(r#"link[rel="mw:PageProp/redirect"]"#).unwrap());
|
||||
|
||||
document.select(&REDIRECT).next().map(|el| {
|
||||
let href = el.value().attr("href").unwrap_or_default().trim();
|
||||
let redirect = href.strip_prefix("./").unwrap_or(href);
|
||||
redirect
|
||||
})
|
||||
}
|
||||
|
||||
/// Attempt to find the wikipedia language of the article.
|
||||
pub fn detect_lang(document: &Html) -> Option<String> {
|
||||
static BASE: Lazy<Selector> = Lazy::new(|| Selector::parse("head > base[href]").unwrap());
|
||||
|
||||
document
|
||||
.select(&BASE)
|
||||
.next()
|
||||
.and_then(|el| el.value().attr("href"))
|
||||
.and_then(|url| {
|
||||
let mut url = url.to_owned();
|
||||
if url.starts_with("//") {
|
||||
url.insert_str(0, "http:");
|
||||
}
|
||||
|
||||
match Url::parse(&url) {
|
||||
Err(e) => {
|
||||
trace!("Error parsing base lang url: {}", e);
|
||||
None
|
||||
}
|
||||
Ok(url) => {
|
||||
let domain = url.domain()?;
|
||||
let (lang, domain) = domain.split_once('.')?;
|
||||
if domain != "wikipedia.org" {
|
||||
trace!("Domain of base lang url is not wikipedia.org: {}", domain);
|
||||
}
|
||||
Some(lang.to_owned())
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn has_text(document: &Html) -> bool {
|
||||
if let Some(root) = ElementRef::wrap(document.tree.root()) {
|
||||
!is_empty_or_whitespace(&root)
|
||||
} else {
|
||||
!document
|
||||
.tree
|
||||
.root()
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.all(|el| is_empty_or_whitespace(&el))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn simplify_html(document: &mut Html, lang: &str) {
|
||||
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
|
||||
remove_sections(document, titles);
|
||||
|
@ -339,7 +402,7 @@ fn expand_id(document: &mut Html, id: NodeId) {
|
|||
node.detach();
|
||||
}
|
||||
|
||||
/// Wrapper around the payload of a panic caught while processing HTML.
///
/// Built with `PanicMsg::new` from the error side of `panic::catch_unwind`
/// in `simplify`, and carried by `HtmlError::Panic`.
///
/// `PartialEq` is derived (a single derive list, not two) because
/// `HtmlError` derives `PartialEq` and holds a `PanicMsg`.
#[derive(Debug, PartialEq)]
pub struct PanicMsg(Cow<'static, str>);
|
||||
|
||||
impl PanicMsg {
|
||||
|
@ -370,11 +433,15 @@ impl Deref for PanicMsg {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
pub enum HtmlError {
|
||||
/// Processing this HTML caused a panic in an underlying library
|
||||
#[error("panicked while processing html")]
|
||||
Panic(#[from] PanicMsg),
|
||||
#[error("page is redirect stub for {0:?}")]
|
||||
Redirect(String),
|
||||
#[error("page has no text after processing")]
|
||||
NoText,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
2
tests/data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html
Normal file
2
tests/data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html
Normal file
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE html>
|
||||
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://en.wikipedia.org/wiki/Special:Redirect/revision/1018506873"><head prefix="mwr: https://en.wikipedia.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="d2663280-0a92-11ee-9251-69f636c11cde"/><meta charset="utf-8"/><meta property="mw:pageId" content="67162846"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/1015759684"/><meta property="mw:revisionSHA1" content="efbb4024f27618d5a8547d1fc093c16f7de0129f"/><meta property="dc:modified" content="2021-04-18T12:27:23.000Z"/><meta property="mw:htmlVersion" content="2.8.0"/><meta property="mw:html:version" content="2.8.0"/><link rel="dc:isVersionOf" href="//en.wikipedia.org/wiki/Abdalc%C4%B1k%2C_A%C5%9Fkale"/><base href="//en.wikipedia.org/wiki/"/><title>Abdalcık, Aşkale</title><link rel="stylesheet" href="/w/load.php?lang=en&modules=mediawiki.skinning.content.parsoid%7Cmediawiki.skinning.interface%7Csite.styles&only=styles&skin=vector"/><meta http-equiv="content-language" content="en"/><meta http-equiv="vary" content="Accept"/></head><body id="mwAA" lang="en" class="mw-content-ltr sitedir-ltr ltr mw-body-content parsoid-body mediawiki mw-parser-output" dir="ltr"><section data-mw-section-id="0" id="mwAQ"><link rel="mw:PageProp/redirect" href="./Aşkale" id="mwAg"/></section></body></html>
|
|
@ -0,0 +1,6 @@
|
|||
<!DOCTYPE html>
|
||||
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://de.wikipedia.org/wiki/Special:Redirect/revision/216559427"><head prefix="mwr: https://de.wikipedia.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="82157de0-c4d8-11ed-90e2-252585de4b86"/><meta charset="utf-8"/><meta property="mw:pageId" content="11692067"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/209688171"/><meta property="mw:revisionSHA1" content="490f5886f51d04c0259915fab3d5e09681c95ac7"/><meta property="dc:modified" content="2021-10-21T14:42:03.000Z"/><meta property="mw:htmlVersion" content="2.7.0"/><meta property="mw:html:version" content="2.7.0"/><link rel="dc:isVersionOf" href="//de.wikipedia.org/wiki/Bahnstrecke_Bassum%E2%80%93Herford"/><base href="//de.wikipedia.org/wiki/"/><title>Bahnstrecke Bassum–Herford</title><link rel="stylesheet" href="/w/load.php?lang=de&modules=mediawiki.skinning.content.parsoid%7Cmediawiki.skinning.interface%7Csite.styles&only=styles&skin=vector"/><meta http-equiv="content-language" content="de"/><meta http-equiv="vary" content="Accept"/></head><body id="mwAA" lang="de" class="mw-content-ltr sitedir-ltr ltr mw-body-content parsoid-body mediawiki mw-parser-output" dir="ltr"><section data-mw-section-id="0" id="mwAQ"><div class="hintergrundfarbe1 rahmenfarbe1 navigation-not-searchable navigation-not-searchable" style="border-style: solid; border-width: 1px; clear: left; margin-bottom:1em; margin-top:1em; padding: 0.25em; overflow: hidden; word-break: break-word; word-wrap: break-word; " id="Vorlage_Falschschreibung" about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"Falschschreibung","href":"./Vorlage:Falschschreibung"},"params":{"Alternativtext":{"wt":"[[Bahnstrecke Bünde–Bassum]] oder [[Bahnstrecke Herford–Kirchlengern]]"}},"i":0}}]}'><div class="noviewer nomobile" style="display: table-cell; padding-bottom: 0.2em; padding-left: 0.25em; 
padding-right: 1em; padding-top: 0.2em; vertical-align: middle;" aria-hidden="true" role="presentation"><span typeof="mw:File" data-mw='{"caption":"Falschschreibung"}'><a href="./Datei:Gtk-dialog-info.svg" class="mw-file-description" title="Falschschreibung"><img alt="" resource="./Datei:Gtk-dialog-info.svg" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/25px-Gtk-dialog-info.svg.png" decoding="async" data-file-width="60" data-file-height="60" data-file-type="drawing" height="25" width="25" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/38px-Gtk-dialog-info.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Gtk-dialog-info.svg/50px-Gtk-dialog-info.svg.png 2x"/></a></span></div>
|
||||
<div style="display: table-cell; vertical-align: middle; width: 100%;">
|
||||
<div role="navigation">
|
||||
Zu diesem Stichwort ist kein Artikel vorhanden; möglicherweise ist <a rel="mw:WikiLink" href="./Bahnstrecke_Bünde–Bassum" title="Bahnstrecke Bünde–Bassum">Bahnstrecke Bünde–Bassum</a> oder <a rel="mw:WikiLink" href="./Bahnstrecke_Herford–Kirchlengern" title="Bahnstrecke Herford–Kirchlengern">Bahnstrecke Herford–Kirchlengern</a> gemeint.</div>
|
||||
</div></div><link rel="mw:PageProp/Category" href="./Kategorie:Wikipedia:Falschschreibung" about="#mwt1"/><meta property="mw:PageProp/disambiguation" about="#mwt1"/><span about="#mwt1"> </span><meta property="mw:PageProp/expectedUnconnectedPage" about="#mwt1"/></section></body></html>
|
|
@ -3,7 +3,7 @@
|
|||
//! To update the expected output, run the test again with the env variable
|
||||
//! `UPDATE_EXPECT=1` set.
|
||||
//! See https://docs.rs/expect-test/ for more information.
|
||||
use om_wikiparser::html::{pretty_print, simplify_html};
|
||||
use om_wikiparser::html::{pretty_print, simplify, simplify_html, HtmlError};
|
||||
|
||||
use expect_test::{expect_file, ExpectFile};
|
||||
use scraper::Html;
|
||||
|
@ -31,3 +31,30 @@ fn simplify_thoor_ballylee() {
|
|||
expect_file!["./data/Q4185820-en/output.html"],
|
||||
);
|
||||
}
|
||||
|
||||
/// A regular article must go through `simplify` without any error.
#[test]
fn not_redirect_crimean_mountains() {
    let result = simplify(include_str!("./data/Q748282-en/original.html"), "en");
    assert!(result.is_ok());
}
|
||||
|
||||
/// A regular article must go through `simplify` without any error.
#[test]
fn not_redirect_thoor_ballylee() {
    let result = simplify(include_str!("./data/Q4185820-en/original.html"), "en");
    assert!(result.is_ok());
}
|
||||
|
||||
/// A redirect stub must be rejected with the redirect target, with the
/// leading `./` of the link removed.
#[test]
fn is_redirect_abdalcık_aşkale() {
    let html = include_str!("./data/redirects/Abdalc%C4%B1k%2C%20A%C5%9Fkale.html");
    let expected = Err(HtmlError::Redirect(String::from("Aşkale")));

    assert_eq!(expected, simplify(html, "en"));
}
|
||||
|
||||
/// A page that has no text left after simplification must be rejected
/// with `HtmlError::NoText`.
#[test]
fn is_empty_bahnstrecke_bassum_herford() {
    // NOTE(review): the `%FF%FF%FF` in this path looks like a mangled en dash
    // (`%E2%80%93`), going by how the other fixture name is encoded. Confirm
    // against the file name on disk before changing it.
    let html = include_str!("./data/redirects/Bahnstrecke%20Bassum%FF%FF%FFHerford.html");
    let result = simplify(html, "en");

    assert_eq!(Err(HtmlError::NoText), result);
}
|
||||
|
|
Loading…
Add table
Reference in a new issue