From bab29c0de9674f9fd77b851ecfcb0050574ea6f1 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Mon, 8 Jul 2024 14:19:16 -0400 Subject: [PATCH] Preserve whitespace of removed "empty" elements Some articles use non-breaking spaces between quantities and units, which Wikipedia seems to wrap with a span. Elements with no or whitespace-only text were previously removed to prune empty tags and parents of other removed elements. This fix preserves the internal whitespace of elements that would otherwise be removed for being "empty". It does not distinguish between "meaningful" whitespace and padding between elements that would be collapsed by HTML formatting rules. It also cannot distinguish between elements that _started_ with only whitespace and nodes that now contain only whitespace after previous steps. The preserved whitespace in the latter case is unlikely to remain because of later processing steps. Fixes #47, fixes organicmaps/organicmaps#8651 Signed-off-by: Evan Lloyd New-Schmidt --- src/html.rs | 9 ++++++--- tests/data/Q748282-en/output.html | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/html.rs b/src/html.rs index d196082..20a9aa6 100644 --- a/src/html.rs +++ b/src/html.rs @@ -202,7 +202,7 @@ pub fn simplify(document: &mut Html, lang: &str) { remove_empty_sections(document); - remove_empty(document); + expand_empty(document); remove_non_element_nodes(document); @@ -305,7 +305,8 @@ fn remove_toplevel_whitespace(document: &mut Html) { remove_ids(document, to_remove.drain(..)); } -fn remove_empty(document: &mut Html) { +/// Expand elements that contain no text or only whitespace, leaving only their contents. +fn expand_empty(document: &mut Html) { let mut to_remove = Vec::new(); for el in document @@ -318,7 +319,9 @@ fn remove_empty(document: &mut Html) { } } - remove_ids(document, to_remove.drain(..)); + for id in to_remove.drain(..) 
{ + expand_id(document, id); + } } fn remove_empty_sections(document: &mut Html) { diff --git a/tests/data/Q748282-en/output.html b/tests/data/Q748282-en/output.html index b98dbba..1ec2e31 100644 --- a/tests/data/Q748282-en/output.html +++ b/tests/data/Q748282-en/output.html @@ -7,7 +7,7 @@
  • Chatyr-Dag yayla
  • Dologorukovskaya (Subatkan) yayla
  • Demirci yayla
  • -
  • Qarabiy yayla
  • Highest peaks

    The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069ft). Other important peaks over 1,200 metres include:

    Highest peaks

    The Crimea's highest peak is the Roman-Kosh (Ukrainian: Роман-Кош; Russian: Роман-Кош, Crimean Tatar: Roman Qoş) on the Babugan Yayla at 1,545 metres (5,069 ft). Other important peaks over 1,200 metres include: