diff --git a/src/html.rs b/src/html.rs index 78071bd..200c45b 100644 --- a/src/html.rs +++ b/src/html.rs @@ -125,6 +125,8 @@ pub fn simplify_html(document: &mut Html, lang: &str) { } remove_ids(document, to_remove.drain(..)); + remove_empty_sections(document); + remove_comments(document); expand_links(document); @@ -152,6 +154,39 @@ fn remove_comments(document: &mut Html) { remove_ids(document, to_remove.drain(..)); } +fn remove_empty_sections(document: &mut Html) { + let mut to_remove = Vec::new(); + for el in document.select(&HEADERS) { + // TODO: does select match on detached nodes? + let Some(parent) = el.parent() else { continue; }; + + if !parent + .value() + .as_element() + .map(|p| p.name() == "section") + .unwrap_or_default() + { + trace!("Skipping header without section name: {:?}", parent); + continue; + } + + if el + .next_siblings() + .filter_map(ElementRef::wrap) + .all(|e| is_empty_or_whitespace(&e) || HEADERS.matches(&e)) + { + trace!( + "Removing empty section {} {:?}", + el.value().name(), + el.text().collect::<String>() + ); + to_remove.push(parent.id()); + } + } + + remove_ids(document, to_remove); +} + fn remove_attrs(document: &mut Html) { // TODO: See if finding and skipping detached nodes is significantly faster. let mut to_remove = Vec::new(); diff --git a/tests/data/Q748282-en/output.html b/tests/data/Q748282-en/output.html index 0762b05..327f1f6 100644 --- a/tests/data/Q748282-en/output.html +++ b/tests/data/Q748282-en/output.html @@ -92,7 +92,4 @@

Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual.

- \ No newline at end of file