Remove images and links
See #11 for next steps Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
9036e3413f
commit
45efd77c0d
6 changed files with 103 additions and 12 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -522,6 +522,7 @@ version = "0.0.0"
|
|||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"ego-tree",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
|
|
|
@ -10,6 +10,7 @@ default-run = "om-wikiparser"
|
|||
[dependencies]
|
||||
anyhow = { version = "1.0.71", features = ["backtrace"] }
|
||||
clap = { version = "4.3.2", features = ["derive"] }
|
||||
ego-tree = "0.6.2"
|
||||
env_logger = "0.10.0"
|
||||
log = "0.4.18"
|
||||
once_cell = "1.18.0"
|
||||
|
|
|
@ -35,6 +35,10 @@ As an example of usage with the map generator:
|
|||
# Transform intermediate files from generator.
|
||||
cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
|
||||
tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
|
||||
# Enable backtraces in errors and panics.
|
||||
export RUST_BACKTRACE=1
|
||||
# Set log level to debug.
|
||||
export RUST_LOG=om_wikiparser=debug
|
||||
# Begin extraction.
|
||||
for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
|
||||
do
|
||||
|
|
|
@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
|
|||
use om_wikiparser::html::simplify;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
.try_init()?;
|
||||
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
||||
|
|
102
src/html.rs
102
src/html.rs
|
@ -1,5 +1,6 @@
|
|||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use ego_tree::NodeId;
|
||||
use once_cell::sync::Lazy;
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use serde::Deserialize;
|
||||
|
@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
for id in to_remove.drain(..) {
|
||||
if let Some(mut node) = document.tree.get_mut(id) {
|
||||
node.detach();
|
||||
}
|
||||
}
|
||||
remove_ids(&mut document, to_remove.drain(..));
|
||||
} else {
|
||||
warn!("No sections to remove configured for lang {lang:?}");
|
||||
}
|
||||
|
||||
// Remove elements with no text that isn't whitespace.
|
||||
|
||||
for element in document
|
||||
for el in document
|
||||
.root_element()
|
||||
.descendants()
|
||||
.filter_map(ElementRef::wrap)
|
||||
{
|
||||
if element.text().all(|t| t.trim().is_empty()) {
|
||||
to_remove.push(element.id());
|
||||
if is_image(&el) || is_empty_or_whitespace(&el) {
|
||||
to_remove.push(el.id());
|
||||
}
|
||||
}
|
||||
remove_ids(&mut document, to_remove.drain(..));
|
||||
|
||||
for id in to_remove.drain(..) {
|
||||
remove_links(&mut document);
|
||||
|
||||
document.html()
|
||||
}
|
||||
|
||||
/// Detach every node named by `ids` from the document's tree.
///
/// An id for which the tree returns no node is silently skipped.
fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
    ids.into_iter().for_each(|node_id| {
        if let Some(mut target) = document.tree.get_mut(node_id) {
            target.detach();
        }
    });
}
|
||||
|
||||
document.html()
|
||||
fn is_empty_or_whitespace(el: &ElementRef) -> bool {
|
||||
el.text().flat_map(str::chars).all(char::is_whitespace)
|
||||
}
|
||||
|
||||
fn is_image(el: &ElementRef) -> bool {
|
||||
["img", "picture"].contains(&el.value().name())
|
||||
}
|
||||
|
||||
/// Remove all links, preserving any inner elements/text.
|
||||
fn remove_links(document: &mut Html) {
|
||||
let links: Vec<_> = document
|
||||
.select(&Selector::parse("a").unwrap())
|
||||
.map(|el| el.id())
|
||||
.collect();
|
||||
|
||||
for id in links {
|
||||
let Some(mut node) = document.tree.get_mut(id) else { continue };
|
||||
if node.parent().is_none() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// reparent to same location as node
|
||||
while let Some(mut child) = node.first_child() {
|
||||
let child_id = child.id();
|
||||
child.detach();
|
||||
node.insert_id_before(child_id);
|
||||
}
|
||||
|
||||
node.detach();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -89,4 +121,50 @@ mod test {
|
|||
fn static_config_parses() {
|
||||
assert!(!CONFIG.sections_to_remove.is_empty());
|
||||
}
|
||||
|
||||
/// Checks that `super::remove_links` strips all anchors, keeps their inner
/// elements, and copes with a link that was already detached from the tree.
#[test]
fn remove_links() {
    let fragment = r#"
<p> Some text that includes
<a href="Some_Page"><span id="inner-content">several</span></a>
<a id="second-link" href="./Another_Page">relative links</a>
and
<a href="https://example.com/page">an absolute link</a>
.
</p>
"#;

    let anchor_sel = Selector::parse("a").unwrap();
    let inner_sel = Selector::parse("#inner-content").unwrap();
    let second_link_sel = Selector::parse("#second-link").unwrap();

    let mut document = Html::parse_fragment(fragment);

    // Sanity-check the fixture before mutating it.
    {
        let hrefs: Vec<_> = document
            .select(&anchor_sel)
            .filter_map(|anchor| anchor.value().attr("href"))
            .collect();

        eprintln!("{}", document.html());

        assert_eq!(
            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
            hrefs,
            "Links in original html are not expected."
        );
    }

    // Orphan one link ahead of time (as if an earlier pass had deleted it)
    // so the function is exercised against a node with no parent.
    let orphan_id = document.select(&second_link_sel).next().unwrap().id();
    document.tree.get_mut(orphan_id).unwrap().detach();

    super::remove_links(&mut document);

    let remaining_links = document.select(&anchor_sel).count();
    assert!(remaining_links == 0, "All links should be removed.");

    let inner_survives = document.select(&inner_sel).next().is_some();
    assert!(inner_survives, "Link inner elements should be preserved.");
}
|
||||
}
|
||||
|
|
|
@ -138,6 +138,8 @@ fn write(
|
|||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Use info level by default, load overrides from `RUST_LOG` env variable.
|
||||
// See https://docs.rs/env_logger/latest/env_logger/index.html#example
|
||||
env_logger::Builder::new()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.parse_default_env()
|
||||
|
|
Loading…
Add table
Reference in a new issue