Remove images and links

See #11 for next steps

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-06-29 15:41:03 -04:00 committed by Evan Lloyd New-Schmidt
parent 9036e3413f
commit 45efd77c0d
6 changed files with 103 additions and 12 deletions

1
Cargo.lock generated
View file

@ -522,6 +522,7 @@ version = "0.0.0"
dependencies = [
"anyhow",
"clap",
"ego-tree",
"env_logger",
"log",
"once_cell",

View file

@ -10,6 +10,7 @@ default-run = "om-wikiparser"
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
ego-tree = "0.6.2"
env_logger = "0.10.0"
log = "0.4.18"
once_cell = "1.18.0"

View file

@ -35,6 +35,10 @@ As an example of usage with the map generator:
# Transform intermediate files from generator.
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
# Enable backtraces in errors and panics.
export RUST_BACKTRACE=1
# Set log level to debug.
export RUST_LOG=om_wikiparser=debug
# Begin extraction.
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
do

View file

@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
use om_wikiparser::html::simplify;
fn main() -> anyhow::Result<()> {
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()
.try_init()?;
let mut input = String::new();
stdin().read_to_string(&mut input)?;

View file

@ -1,5 +1,6 @@
use std::collections::{BTreeMap, BTreeSet};
use ego_tree::NodeId;
use once_cell::sync::Lazy;
use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
}
}
for id in to_remove.drain(..) {
if let Some(mut node) = document.tree.get_mut(id) {
node.detach();
}
}
remove_ids(&mut document, to_remove.drain(..));
} else {
warn!("No sections to remove configured for lang {lang:?}");
}
// Remove images and elements with no text that isn't whitespace.
for element in document
for el in document
.root_element()
.descendants()
.filter_map(ElementRef::wrap)
{
if element.text().all(|t| t.trim().is_empty()) {
to_remove.push(element.id());
if is_image(&el) || is_empty_or_whitespace(&el) {
to_remove.push(el.id());
}
}
remove_ids(&mut document, to_remove.drain(..));
for id in to_remove.drain(..) {
remove_links(&mut document);
document.html()
}
/// Detach each node named by `ids` from `document`'s tree.
///
/// Ids for which the tree returns no node are silently ignored.
fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
    ids.into_iter()
        .for_each(|id| match document.tree.get_mut(id) {
            Some(mut found) => found.detach(),
            None => {}
        });
}
document.html()
/// Report whether every character of the text yielded by `el.text()` is whitespace.
///
/// An element that yields no text at all also counts as empty, so this returns `true` for it.
fn is_empty_or_whitespace(el: &ElementRef) -> bool {
    el.text()
        .all(|chunk| chunk.chars().all(char::is_whitespace))
}
/// Report whether `el` is an `<img>` or `<picture>` element, judged by its tag name.
fn is_image(el: &ElementRef) -> bool {
    matches!(el.value().name(), "img" | "picture")
}
/// Strip every `<a>` element from `document` while keeping whatever it wrapped.
///
/// Each anchor's children are moved, first child first, to sit directly in front of the anchor,
/// and the emptied anchor is then detached from the tree.
/// Anchors that have no parent are left untouched.
fn remove_links(document: &mut Html) {
    let anchor_selector = Selector::parse("a").unwrap();
    // Collect ids up front so the tree can be mutated without holding a borrow from `select`.
    let anchor_ids: Vec<_> = document
        .select(&anchor_selector)
        .map(|anchor| anchor.id())
        .collect();

    for anchor_id in anchor_ids {
        let mut anchor = match document.tree.get_mut(anchor_id) {
            Some(anchor) => anchor,
            None => continue,
        };

        // Orphans have no sibling position to move children into.
        // NOTE(review): presumably these were detached by an earlier pass — confirm.
        if anchor.parent().is_none() {
            continue;
        }

        // Hoist the children out one at a time so they take the anchor's place.
        while let Some(child_id) = anchor.first_child().map(|mut child| {
            let id = child.id();
            child.detach();
            id
        }) {
            anchor.insert_id_before(child_id);
        }

        anchor.detach();
    }
}
#[cfg(test)]
@ -89,4 +121,50 @@ mod test {
fn static_config_parses() {
assert!(!CONFIG.sections_to_remove.is_empty());
}
#[test]
fn remove_links() {
    let anchor_sel = Selector::parse("a").unwrap();
    let inner_sel = Selector::parse("#inner-content").unwrap();
    let second_link_sel = Selector::parse("#second-link").unwrap();

    let source = r#"
<p> Some text that includes
<a href="Some_Page"><span id="inner-content">several</span></a>
<a id="second-link" href="./Another_Page">relative links</a>
and
<a href="https://example.com/page">an absolute link</a>
.
</p>
"#;
    let mut doc = Html::parse_fragment(source);

    // Sanity-check the fixture: all three hrefs must be present before anything is removed.
    let hrefs_before: Vec<_> = doc
        .select(&anchor_sel)
        .filter_map(|anchor| anchor.value().attr("href"))
        .collect();
    eprintln!("{}", doc.html());
    assert_eq!(
        vec!["Some_Page", "./Another_Page", "https://example.com/page"],
        hrefs_before,
        "Links in original html are not expected."
    );

    // Orphan one link first (simulating an earlier removal) so that the function under test
    // is exercised against a node that has no parent.
    let orphan_id = doc.select(&second_link_sel).next().unwrap().id();
    doc.tree.get_mut(orphan_id).unwrap().detach();

    super::remove_links(&mut doc);

    assert!(
        doc.select(&anchor_sel).next().is_none(),
        "All links should be removed."
    );

    let surviving_inner = doc.select(&inner_sel).count();
    assert!(
        surviving_inner > 0,
        "Link inner elements should be preserved."
    );
}
}

View file

@ -138,6 +138,8 @@ fn write(
}
fn main() -> anyhow::Result<()> {
// Use info level by default, load overrides from `RUST_LOG` env variable.
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()