diff --git a/src/html.rs b/src/html.rs
index 2684c49..d1b7c3a 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -71,6 +71,8 @@ static ELEMENT_DENY_LIST: Lazy = Lazy::new(|| {
             "embed",
             // Pronunciation "listen" link/button.
             r#"span[typeof="mw:Transclusion"][data-mw*="\"audio\":"]"#,
+            // Coordinates transclusion.
+            "span#coordinates",
         ]
         .join(", "),
     )
@@ -121,9 +123,7 @@ pub fn simplify_html(document: &mut Html, lang: &str) {
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if (ELEMENT_DENY_LIST.matches(&el) || is_empty_or_whitespace(&el))
-            && !ELEMENT_ALLOW_LIST.matches(&el)
-        {
+        if ELEMENT_DENY_LIST.matches(&el) && !ELEMENT_ALLOW_LIST.matches(&el) {
             to_remove.push(el.id());
         }
     }
@@ -131,6 +131,8 @@ pub fn simplify_html(document: &mut Html, lang: &str) {
 
     remove_empty_sections(document);
 
+    remove_empty(document);
+
     remove_non_element_nodes(document);
 
     expand_links(document);
@@ -187,6 +189,22 @@ fn remove_toplevel_whitespace(document: &mut Html) {
     remove_ids(document, to_remove.drain(..));
 }
 
+fn remove_empty(document: &mut Html) {
+    let mut to_remove = Vec::new();
+    // Collect every descendant element that is_empty_or_whitespace() accepts; removed below via remove_ids().
+    for el in document
+        .root_element()
+        .descendants()
+        .filter_map(ElementRef::wrap)
+    {
+        if is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
+        }
+    }
+    // NOTE(review): unlike the check dropped from simplify_html(), this ignores ELEMENT_ALLOW_LIST -- confirm intended.
+    remove_ids(document, to_remove.drain(..));
+}
+
 fn remove_empty_sections(document: &mut Html) {
     let mut to_remove = Vec::new();
     for el in document.select(&HEADERS) {
diff --git a/tests/data/Q4185820-en/output.html b/tests/data/Q4185820-en/output.html
index c9c5590..a968275 100644
--- a/tests/data/Q4185820-en/output.html
+++ b/tests/data/Q4185820-en/output.html
@@ -1,7 +1,3 @@
-

- - Coordinates: 53°06′11.4″N08°46′29.2″W / 53.103167°N 8.774778°W / 53.103167; -8.774778 -

Thoor Ballylee Castle (IrishTúr Bhaile Uí Laí) is a fortified, 15th-century Anglo-Normantower house built by the septsde Burgo, or Burke, near the town of Gort in County Galway, Ireland. It is also known as Yeats' Tower because it was once owned and inhabited by the poet William Butler Yeats.