diff --git a/Cargo.lock b/Cargo.lock
index 68dd02e..c814b23 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -524,6 +524,7 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
+ "once_cell",
  "scraper",
  "serde",
  "serde_json",
@@ -533,9 +534,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.2"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "parking_lot"
diff --git a/Cargo.toml b/Cargo.toml
index 3cc52c7..09f05f4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.18"
+once_cell = "1.18.0"
 scraper = "0.16.0"
 serde = { version = "1.0.163", features = ["derive"] }
 serde_json = "1.0.96"
diff --git a/README.md b/README.md
index 2c6ee28..9c95dd0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
 # wikiparser
 
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
+
+## Usage
+
+[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
+It defines article sections that are not important for users and should be removed.
diff --git a/article_processing_config.json b/article_processing_config.json
new file mode 100644
index 0000000..2222efb
--- /dev/null
+++ b/article_processing_config.json
@@ -0,0 +1,44 @@
+{
+    "sections_to_remove": {
+        "de": [
+            "Anmerkungen",
+            "Anmerkungen und Einzelnachweise",
+            "Einzelbelege",
+            "Einzelnachweise",
+            "Filme",
+            "Literatur",
+            "Siehe auch",
+            "Weblinks"
+        ],
+        "en": [
+            "Bibliography",
+            "External links",
+            "Further reading",
+            "References",
+            "See also",
+            "Sources"
+        ],
+        "es": [
+            "Enlaces externos",
+            "Referencias",
+            "Véase también",
+            "Vínculos de interés"
+        ],
+        "fr": [
+            "Articles connexes",
+            "Bibliographie",
+            "Lien externe",
+            "Liens externes",
+            "Notes et références",
+            "Références",
+            "Voir aussi"
+        ],
+        "ru": [
+            "Библиография",
+            "Литература",
+            "Примечания",
+            "См. также",
+            "Ссылки"
+        ]
+    }
+}
diff --git a/src/bin/simplify_html.rs b/src/bin/simplify_html.rs
index d24c7f5..54fae4e 100644
--- a/src/bin/simplify_html.rs
+++ b/src/bin/simplify_html.rs
@@ -10,7 +10,7 @@ fn main() -> anyhow::Result<()> {
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
 
-    let output = simplify(&input);
+    let output = simplify(&input, "en");
 
     stdout().write_all(output.as_bytes())?;
 
diff --git a/src/html.rs b/src/html.rs
index 9143021..d970eef 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -1,49 +1,63 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
+use serde::Deserialize;
 
-pub fn simplify(html: &str) -> String {
-    // TODO: handle multiple languages
-    let bad_sections = [
-        "External links",
-        "Sources",
-        "See also",
-        "Bibliography",
-        "Further reading",
-        "References",
-    ];
+#[derive(Debug, Deserialize)]
+struct Config<'a> {
+    #[serde(borrow)]
+    sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
+}
+
+static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
+    serde_json::from_str(include_str!(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/article_processing_config.json"
+    )))
+    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+});
+
+static HEADERS: Lazy<Selector> =
+    Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
+
+pub fn simplify(html: &str, lang: &str) -> String {
     let mut document = Html::parse_document(html);
 
-    // TODO: evaluate this only once
-    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();
-
     let mut to_remove = Vec::new();
 
     // remove sections
-    for header in document.select(&headers) {
-        // TODO: should this join all text nodes?
-        let Some(title) = header.text().next() else {
-            continue
-        };
-
-        if bad_sections.contains(&title) {
-            to_remove.push(header.id());
-            let header_level = header.value().name();
-            // strip trailing nodes
-            for sibling in header.next_siblings() {
-                if let Some(element) = sibling.value().as_element() {
-                    if element.name() == header_level {
-                        // TODO: should this check for a higher level?
-                        break;
+
+    if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
+        for header in document.select(&HEADERS) {
+            // TODO: should this join all text nodes?
+            let Some(title) = header.text().next() else {
+                continue
+            };
+
+            if bad_sections.contains(&title.trim()) {
+                to_remove.push(header.id());
+                let header_level = header.value().name();
+                // strip trailing nodes
+                for sibling in header.next_siblings() {
+                    if let Some(element) = sibling.value().as_element() {
+                        if element.name() == header_level {
+                            // TODO: should this check for a higher level?
+                            break;
+                        }
                     }
+                    to_remove.push(sibling.id());
                 }
-                to_remove.push(sibling.id());
             }
         }
-    }
 
-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
+        for id in to_remove.drain(..) {
+            if let Some(mut node) = document.tree.get_mut(id) {
+                node.detach();
+            }
         }
+    } else {
+        warn!("No sections to remove configured for lang {lang:?}");
     }
 
     // remove elements with no text that isn't whitespace
@@ -66,3 +80,13 @@ pub fn simplify(html: &str) -> String {
 
     document.html()
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn static_config_parses() {
+        assert!(!CONFIG.sections_to_remove.is_empty());
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5648444..15063e9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,5 @@
 pub mod html;
 pub mod wm;
+
+#[macro_use]
+extern crate log;
diff --git a/src/main.rs b/src/main.rs
index 41f61b3..f085e3b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,9 +9,9 @@
 // --wikipedia-urls /tmp/wikipedia_urls.txt \
 // output_dir
 use std::{
-    fs::File,
+    fs::{create_dir, File},
     io::{stdin, BufRead, Write},
-    path::PathBuf,
+    path::{Path, PathBuf},
 };
 
 use anyhow::bail;
@@ -33,6 +33,37 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
+fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
+    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
+        // TODO: handle and still write
+        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+    };
+
+    let mut filename = dir.as_ref().to_owned();
+    filename.push(qid);
+    filename.push(&page.in_language.identifier);
+    filename.set_extension("html");
+
+    debug!("{:?}: {:?}", page.name, filename);
+
+    if filename.exists() {
+        debug!("Exists, skipping");
+        return Ok(());
+    }
+
+    let subfolder = filename.parent().unwrap();
+    if !subfolder.exists() {
+        create_dir(subfolder)?;
+    }
+
+    let html = simplify(&page.article_body.html, &page.in_language.identifier);
+
+    let mut file = File::create(&filename)?;
+    file.write_all(html.as_bytes())?;
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
@@ -79,24 +110,9 @@ fn main() -> anyhow::Result<()> {
             continue;
         }
 
-        let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-            warn!("Page in list but without wikidata qid: {:?}", page.name);
-            continue;
-        };
-
-        let filename = args.output_dir.join(qid).with_extension("html");
-
-        debug!("{:?}: {:?}", page.name, filename);
-
-        if filename.exists() {
-            debug!("Exists, skipping");
-            continue;
+        if let Err(e) = write(&args.output_dir, page) {
+            error!("Error writing article: {}", e);
         }
-
-        let html = simplify(&page.article_body.html);
-
-        let mut file = File::create(filename)?;
-        file.write_all(html.as_bytes())?;
     }
 
     Ok(())
diff --git a/src/wm/mod.rs b/src/wm/mod.rs
index 00f433c..4e8cf02 100644
--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs
@@ -58,15 +58,14 @@ pub fn is_wikipedia_match(
     titles: &HashSet<WikipediaTitleNorm>,
     page: &Page,
 ) -> Option<WikipediaTitleNorm> {
-    // TODO: handle multiple languages
-    let title = WikipediaTitleNorm::from_title(&page.name, "en");
+    let title = WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier);
 
     if titles.get(&title).is_some() {
         return Some(title);
     }
 
     for redirect in &page.redirects {
-        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
+        let title = WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier);
 
         if titles.get(&title).is_some() {
             return Some(title);
diff --git a/src/wm/page.rs b/src/wm/page.rs
index b830fd9..c118b5a 100644
--- a/src/wm/page.rs
+++ b/src/wm/page.rs
@@ -1,5 +1,6 @@
 use serde::Deserialize;
 
+// TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
 /// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
@@ -9,6 +10,7 @@ pub struct Page {
     // TODO: check if CoW has a performance impact
     pub name: String,
     pub date_modified: String,
+    pub in_language: Language,
     #[serde(default)]
     pub url: String,
     pub main_entity: Option<Wikidata>,
@@ -25,6 +27,8 @@ pub struct Wikidata {
 
 #[derive(Deserialize)]
 pub struct ArticleBody {
+    // TODO: look into RawValue to lazily parse/allocate this:
+    // https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
     pub html: String,
 }
 
@@ -34,3 +38,8 @@ pub struct Redirect {
     pub url: String,
     pub name: String,
 }
+
+#[derive(Deserialize)]
+pub struct Language {
+    pub identifier: String,
+}
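
A minimal sketch (not part of the patch above) of how the new language-aware `simplify` is called, assuming the library crate is importable as `om_wikiparser`; the actual crate name may differ.

```rust
// `om_wikiparser` is an assumed crate name; use the name from Cargo.toml.
use om_wikiparser::html::simplify;

fn main() {
    let html = r#"<html><body>
        <h2>Coffee</h2><p>Kept: this heading is not listed in the config.</p>
        <h2>References</h2><p>Dropped: "References" is in the "en" list.</p>
    </body></html>"#;

    // The language identifier selects the matching entry in
    // article_processing_config.json; for an unconfigured language the
    // function logs a warning and only the whitespace-element cleanup runs.
    let simplified = simplify(html, "en");
    println!("{simplified}");
}
```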