Remove empty sections after other removals

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-15 18:36:02 -04:00 committed by Evan Lloyd New-Schmidt
parent cc3ae9b629
commit 58f32b43fd
2 changed files with 35 additions and 3 deletions

View file

@ -125,6 +125,8 @@ pub fn simplify_html(document: &mut Html, lang: &str) {
}
remove_ids(document, to_remove.drain(..));
remove_empty_sections(document);
remove_comments(document);
expand_links(document);
@ -152,6 +154,39 @@ fn remove_comments(document: &mut Html) {
remove_ids(document, to_remove.drain(..));
}
/// Remove `<section>` elements that hold a header but no meaningful content after it.
///
/// For every element matched by `HEADERS` whose direct parent is a `<section>`,
/// the element siblings that follow the header are inspected. If each of them is
/// either empty/whitespace-only (per `is_empty_or_whitespace`) or itself matches
/// `HEADERS`, the parent section's id is queued and handed to `remove_ids`.
///
/// Headers without a parent, or whose parent is not a `<section>` element, are
/// left alone.
///
/// NOTE(review): only *element* siblings are examined (`ElementRef::wrap` filters
/// out other node kinds), so bare text placed directly after a header does not
/// count as content here — confirm this is intended.
fn remove_empty_sections(document: &mut Html) {
    let mut empty_sections = Vec::new();

    for header in document.select(&HEADERS) {
        // TODO: does select match on detached nodes?
        let section = match header.parent() {
            Some(node) => node,
            None => continue,
        };

        // Only headers that sit directly inside a `<section>` are candidates.
        let parent_is_section = section
            .value()
            .as_element()
            .map_or(false, |parent| parent.name() == "section");
        if !parent_is_section {
            trace!("Skipping header without section name: {:?}", section);
            continue;
        }

        // A following element that is neither blank nor another header means the
        // section still carries content and must be kept.
        let has_content = header
            .next_siblings()
            .filter_map(ElementRef::wrap)
            .any(|sibling| !is_empty_or_whitespace(&sibling) && !HEADERS.matches(&sibling));
        if has_content {
            continue;
        }

        trace!(
            "Removing empty section {} {:?}",
            header.value().name(),
            header.text().collect::<String>()
        );
        empty_sections.push(section.id());
    }

    remove_ids(document, empty_sections);
}
fn remove_attrs(document: &mut Html) {
// TODO: See if finding and skipping detached nodes is significantly faster.
let mut to_remove = Vec::new();

View file

@ -92,7 +92,4 @@
<p>
Archaeologists have found the earliest anatomically modern humans in Europe in the Crimean Mountains' Buran-Kaya caves. The fossils are 32,000 years old, with the artifacts linked to the Gravettian culture. The fossils have cut marks suggesting a post-mortem defleshing ritual.
</p>
<h2 id="Gallery">
Gallery
</h2>
</html>