diff --git a/src/html.rs b/src/html.rs
index 39b5bdc..a1b6c89 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -1,3 +1,13 @@
+//! Simplification of Wikipedia Enterprise HTML.
+//!
+//! The goal is to process the [Enterprise API](https://enterprise.wikimedia.com/) HTML output similarly to the [TextExtracts API](https://www.mediawiki.org/wiki/Extension:TextExtracts).
+//! In particular:
+//! - All images, media, tables, and wrapper elements like divs and sections are removed.
+//! - Doctype, comments, `html`, `body`, `head`, etc. are removed.
+//! - Only top-level headers, paragraphs, links and basic text formatting (`b`, `i`, etc.) are kept.
+//!
+//! The HTML that the TextExtracts API starts with seems to be different from the Enterprise API, which follows [this spec](https://www.mediawiki.org/wiki/Specs/HTML/) and seems to have more content and data encoded in attributes.
+
use std::{
any::Any,
borrow::Cow,
@@ -187,7 +197,7 @@ pub fn has_text(document: &Html) -> bool {
/// handles panics and other errors.
pub fn simplify(document: &mut Html, lang: &str) {
if let Some(titles) = CONFIG.sections_to_remove.get(lang) {
- remove_sections(document, titles);
+ remove_named_header_siblings(document, titles);
}
remove_denylist_elements(document);
@@ -213,43 +223,34 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator- ) {
}
}
-/// Remove sections with the specified `titles` and all trailing elements until next section.
+/// Remove headers with the specified `titles` and all following siblings until the next header of greater or equal level.
///
/// `titles` are matched by case-sensitive simple byte comparison.
/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
-fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
+fn remove_named_header_siblings(document: &mut Html, titles: &BTreeSet<&str>) {
let mut to_remove = Vec::new();
for header in document.select(&HEADERS) {
- let Some(parent) = header.parent() else {
- continue;
- };
-
- if !parent
- .value()
- .as_element()
- .map(|p| p.name() == "section")
- .unwrap_or_default()
- {
- trace!("Skipping header without section name: {:?}", parent);
- continue;
- }
-
// TODO: Should this join all text nodes?
let Some(title) = header.text().next() else {
continue;
};
-
if !titles.contains(title) {
continue;
}
- trace!(
- "Removing denylisted section {} {:?}",
- header.value().name(),
- header.text().collect::<String>()
- );
- to_remove.push(parent.id());
+ to_remove.push(header.id());
+ let header_level = header.value().name();
+ trace!("Removing trailing siblings for header {header_level} {title:?}");
+ for sibling in header.next_siblings() {
+ if let Some(element) = ElementRef::wrap(sibling) {
+ if HEADERS.matches(&element) && element.value().name() <= header_level {
+ trace!("Stopping removal early at {}", element.value().name(),);
+ break;
+ }
+ }
+ to_remove.push(sibling.id());
+ }
}
remove_ids(document, to_remove.drain(..));
@@ -511,7 +512,7 @@ mod test {
/// text, which this does not handle.
///
/// See also:
- /// - [super::remove_sections]
+ /// - [super::remove_named_header_siblings]
/// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
/// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
#[test]
@@ -590,4 +591,47 @@ mod test {
"Link inner elements should be preserved."
);
}
+
+ #[test]
+ /// Verify trailing siblings are removed up to superheaders.
+ fn remove_headers() {
+ let html = r#"
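+ <h1>Title</h1>
+ <p id="p1">Foo bar</p>
+ <h2>Section 1</h2>
+ <p id="p2">Foo bar</p>
+ <h3>Subsection</h3>
+ <p id="p3">Foo bar</p>
+ <h1>Next Title</h1>
+ <p id="p4">Foo bar</p>
+ <h2>Section 2</h2>
+ <p id="p5">Foo bar</p>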
+ "#;
+
+ let paragraphs = Selector::parse("p").unwrap();
+
+ let mut document = Html::parse_fragment(html);
+
+ assert_eq!(
+ vec!["p1", "p2", "p3", "p4", "p5"],
+ document
+ .select(&paragraphs)
+ .map(|el| el.value().id().unwrap_or_default())
+ .collect::<Vec<_>>(),
+ "paragraphs in original html are not expected"
+ );
+
+ remove_named_header_siblings(&mut document, &BTreeSet::from_iter(Some("Section 1")));
+
+ eprintln!("{}", document.html());
+
+ assert_eq!(
+ vec!["p1", "p4", "p5"],
+ document
+ .select(&paragraphs)
+ .map(|el| el.value().id().unwrap_or_default())
+ .collect::<Vec<_>>(),
+ "only p2 and p3 should be removed"
+ );
+ }
}