2023-07-10 14:58:56 +00:00 · 2023-06-30 05:32:12 +00:00 · 2023-06-30 15:17:22 +00:00 · 2023-06-30 05:34:00 +00:00 · 2023-06-30 05:35:35 +00:00 · 2023-06-30 05:38:28 +00:00
6 changed files with 103 additions and 12 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -522,6 +522,7 @@ version = "0.0.0"
 dependencies = [
 "anyhow",
 "clap",
+ "ego-tree",
 "env_logger",
 "log",
 "once_cell",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
+ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
 once_cell = "1.18.0"
--- a/README.md
+++ b/README.md
@ -35,6 +35,10 @@ As an example of usage with the map generator:
 # Transform intermediate files from generator.
 cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
 tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set the log level to debug for om_wikiparser.
+export RUST_LOG=om_wikiparser=debug
 # Begin extraction.
 for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
 do
--- a/src/bin/simplify_html.rs
+++ b/src/bin/simplify_html.rs
@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
 use om_wikiparser::html::simplify;

 fn main() -> anyhow::Result<()> {
+    env_logger::Builder::new()
+        .filter_level(log::LevelFilter::Info)
+        .parse_default_env()
+        .try_init()?;
+
    let mut input = String::new();
    stdin().read_to_string(&mut input)?;

--- a/src/html.rs
+++ b/src/html.rs
@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};

+use ego_tree::NodeId;
 use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
 use serde::Deserialize;
@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
            }
        }

-        for id in to_remove.drain(..) {
-            if let Some(mut node) = document.tree.get_mut(id) {
-                node.detach();
-            }
-        }
+        remove_ids(&mut document, to_remove.drain(..));
    } else {
        warn!("No sections to remove configured for lang {lang:?}");
    }

-    // Remove elements with no text that isn't whitespace.
-
-    for element in document
+    for el in document
        .root_element()
        .descendants()
        .filter_map(ElementRef::wrap)
    {
-        if element.text().all(|t| t.trim().is_empty()) {
-            to_remove.push(element.id());
+        if is_image(&el) || is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
        }
    }
+    remove_ids(&mut document, to_remove.drain(..));

-    for id in to_remove.drain(..) {
+    remove_links(&mut document);
+
+    document.html()
+}
+
+fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) { // Detaches every node named in `ids` from `document`'s tree.
+    for id in ids { // ids that `get_mut` cannot resolve are skipped silently
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }
+}

-    document.html()
+fn is_empty_or_whitespace(el: &ElementRef) -> bool { // True when the text yielded by `el.text()` contains no non-whitespace character.
+    el.text().flat_map(str::chars).all(char::is_whitespace) // `all` over an empty iterator is true, so an element with no text counts as empty
+}
+
+fn is_image(el: &ElementRef) -> bool { // True when the element's tag name is `img` or `picture`.
+    ["img", "picture"].contains(&el.value().name())
+}
+
+/// Remove every `a` element selected from `document`, moving each link's children into the link's former position so inner elements/text are preserved; links that have no parent are skipped.
+fn remove_links(document: &mut Html) {
+    let links: Vec<_> = document // collect the ids first so the tree is not borrowed by the selection while it is mutated below
+        .select(&Selector::parse("a").unwrap())
+        .map(|el| el.id())
+        .collect();
+
+    for id in links {
+        let Some(mut node) = document.tree.get_mut(id) else { continue };
+        if node.parent().is_none() { // orphaned link: it has no sibling position to move its children into, so leave it alone
+            continue;
+        }
+
+        // Move each child out of the link and insert it immediately before the link, so the children keep their order and end up where the link was.
+        while let Some(mut child) = node.first_child() {
+            let child_id = child.id();
+            child.detach();
+            node.insert_id_before(child_id);
+        }
+
+        node.detach(); // the link has no children left at this point; take it out of the tree
+    }
 }

 #[cfg(test)]
@ -89,4 +121,50 @@ mod test {
    fn static_config_parses() {
        assert!(!CONFIG.sections_to_remove.is_empty());
    }
+
+    #[test]
+    fn remove_links() { // Checks that `super::remove_links` leaves no `a` elements selectable while keeping a link's inner element.
+        let html = r#"
+        <p> Some text that includes
+            <a href="Some_Page"><span id="inner-content">several</span></a>
+            <a id="second-link" href="./Another_Page">relative links</a>
+        and
+            <a href="https://example.com/page">an absolute link</a>
+        .
+        </p>
+        "#;
+
+        let anchors = Selector::parse("a").unwrap();
+        let inner_element = Selector::parse("#inner-content").unwrap();
+        let second_link = Selector::parse("#second-link").unwrap();
+
+        let mut document = Html::parse_fragment(html);
+        let links: Vec<_> = document // `href` values present before anything is removed
+            .select(&anchors)
+            .filter_map(|el| el.value().attr("href"))
+            .collect();
+
+        eprintln!("{}", document.html());
+
+        assert_eq!(
+            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
+            links,
+            "Links in original html are not expected."
+        );
+
+        // Detach one of the links from the root tree (as if previously deleted) to ensure `remove_links` copes with an orphan node instead of panicking.
+        let link = document.select(&second_link).next().unwrap().id();
+        document.tree.get_mut(link).unwrap().detach();
+
+        super::remove_links(&mut document);
+
+        let links: Vec<_> = document.select(&anchors).collect(); // anchors the selector still matches after removal
+
+        assert!(links.is_empty(), "All links should be removed.");
+
+        assert!(
+            document.select(&inner_element).next().is_some(),
+            "Link inner elements should be preserved."
+        );
+    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -138,6 +138,8 @@ fn write(
 }

 fn main() -> anyhow::Result<()> {
+    // Use info level by default, load overrides from `RUST_LOG` env variable.
+    // See https://docs.rs/env_logger/latest/env_logger/index.html#example
    env_logger::Builder::new()
        .filter_level(log::LevelFilter::Info)
        .parse_default_env()