Add support for multiple languages

Per-language section removal is configured with a static JSON file.

A test ensures that the config file exists and is formatted correctly.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt, 2023-06-07 15:55:18 -04:00 (committed by Evan Lloyd New-Schmidt)
parent 35faadc693
commit 8435682ddf
10 changed files with 159 additions and 57 deletions
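In short, `simplify` now takes a language code and consults the bundled config for that language's list of removable section titles. A minimal sketch of the new call (the crate/module path `om_wikiparser::html` is an assumption based on the library layout in this diff; the HTML is invented for illustration):

```rust
use om_wikiparser::html::simplify;

fn main() {
    // "Weblinks" appears in the "de" list in article_processing_config.json,
    // so that heading and everything after it (up to the next heading of the
    // same level) is stripped; "Geschichte" is not listed and is kept.
    let html = "<h2>Geschichte</h2><p>kept</p><h2>Weblinks</h2><p>removed</p>";
    let simplified = simplify(html, "de");
    assert!(!simplified.contains("Weblinks"));
}
```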

Cargo.lock (generated)

@@ -524,6 +524,7 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
+ "once_cell",
  "scraper",
  "serde",
  "serde_json",
@@ -533,9 +534,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.2"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "parking_lot"

Cargo.toml

@@ -12,6 +12,7 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
 env_logger = "0.10.0"
 log = "0.4.18"
+once_cell = "1.18.0"
 scraper = "0.16.0"
 serde = { version = "1.0.163", features = ["derive"] }
 serde_json = "1.0.96"

README.md

@@ -1,3 +1,8 @@
 # wikiparser
 
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
+
+## Usage
+
+[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
+It defines article sections that are not important for users and should be removed.
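A hypothetical example of what adding a new language looks like (the `it` entry and its section titles are illustrative only, following the structure of the file added below):

```json
{
    "sections_to_remove": {
        "it": [
            "Bibliografia",
            "Collegamenti esterni",
            "Note",
            "Voci correlate"
        ]
    }
}
```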

article_processing_config.json (new)

@@ -0,0 +1,44 @@
+{
+    "sections_to_remove": {
+        "de": [
+            "Anmerkungen",
+            "Anmerkungen und Einzelnachweise",
+            "Einzelbelege",
+            "Einzelnachweise",
+            "Filme",
+            "Literatur",
+            "Siehe auch",
+            "Weblinks"
+        ],
+        "en": [
+            "Bibliography",
+            "External links",
+            "Further reading",
+            "References",
+            "See also",
+            "Sources"
+        ],
+        "es": [
+            "Enlaces externos",
+            "Referencias",
+            "Véase también",
+            "Vínculos de interés"
+        ],
+        "fr": [
+            "Articles connexes",
+            "Bibliographie",
+            "Lien externe",
+            "Liens externes",
+            "Notes et références",
+            "Références",
+            "Voir aussi"
+        ],
+        "ru": [
+            "Библиография",
+            "Литература",
+            "Примечания",
+            "См. также",
+            "Ссылки"
+        ]
+    }
+}

src/bin/simplify_html.rs

@@ -10,7 +10,7 @@ fn main() -> anyhow::Result<()> {
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
 
-    let output = simplify(&input);
+    let output = simplify(&input, "en");
 
     stdout().write_all(output.as_bytes())?;

src/html.rs

@@ -1,49 +1,63 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
+use serde::Deserialize;
+
+#[derive(Debug, Deserialize)]
+struct Config<'a> {
+    #[serde(borrow)]
+    sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
+}
+
+static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
+    serde_json::from_str(include_str!(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/article_processing_config.json"
+    )))
+    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+});
+
+static HEADERS: Lazy<Selector> =
+    Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
 
-pub fn simplify(html: &str) -> String {
-    // TODO: handle multiple languages
-    let bad_sections = [
-        "External links",
-        "Sources",
-        "See also",
-        "Bibliography",
-        "Further reading",
-        "References",
-    ];
+pub fn simplify(html: &str, lang: &str) -> String {
     let mut document = Html::parse_document(html);
-    // TODO: evaluate this only once
-    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();
 
     let mut to_remove = Vec::new();
 
     // remove sections
-    for header in document.select(&headers) {
-        // TODO: should this join all text nodes?
-        let Some(title) = header.text().next() else {
-            continue
-        };
-        if bad_sections.contains(&title) {
-            to_remove.push(header.id());
-            let header_level = header.value().name();
-            // strip trailing nodes
-            for sibling in header.next_siblings() {
-                if let Some(element) = sibling.value().as_element() {
-                    if element.name() == header_level {
-                        // TODO: should this check for a higher level?
-                        break;
+    if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
+        for header in document.select(&HEADERS) {
+            // TODO: should this join all text nodes?
+            let Some(title) = header.text().next() else {
+                continue
+            };
+            if bad_sections.contains(&title.trim()) {
+                to_remove.push(header.id());
+                let header_level = header.value().name();
+                // strip trailing nodes
+                for sibling in header.next_siblings() {
+                    if let Some(element) = sibling.value().as_element() {
+                        if element.name() == header_level {
+                            // TODO: should this check for a higher level?
+                            break;
+                        }
                     }
+                    to_remove.push(sibling.id());
                 }
-                to_remove.push(sibling.id());
             }
         }
-    }
-
-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
+
+        for id in to_remove.drain(..) {
+            if let Some(mut node) = document.tree.get_mut(id) {
+                node.detach();
+            }
         }
+    } else {
+        warn!("No sections to remove configured for lang {lang:?}");
     }
 
     // remove elements with no text that isn't whitespace
@@ -66,3 +80,13 @@ pub fn simplify(html: &str) -> String {
 
     document.html()
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn static_config_parses() {
+        assert!(!CONFIG.sections_to_remove.is_empty());
+    }
+}
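The removal logic drops a matched header plus every following sibling until the next header of the same level, so lower-level subsections go with their parent section. A test-style sketch of that behavior that could sit alongside `static_config_parses` (the test name and HTML are invented):

```rust
#[test]
fn removes_section_up_to_next_same_level_header() {
    // "References" is in the "en" list; the <h3> after it is lower-level,
    // so removal continues until the next <h2>.
    let html = "<h2>History</h2><p>kept</p>\
                <h2>References</h2><p>dropped</p><h3>Notes</h3><p>also dropped</p>\
                <h2>Geography</h2><p>kept</p>";
    let out = simplify(html, "en");
    assert!(out.contains("Geography"));
    assert!(!out.contains("dropped"));
}
```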

src/lib.rs

@@ -1,2 +1,5 @@
 pub mod html;
 pub mod wm;
+
+#[macro_use]
+extern crate log;
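With `#[macro_use]`, the `log` macros (`debug!`, `warn!`, `error!`) used by the new code become available crate-wide without per-module imports. The equivalent per-module style, as a side note rather than what this commit does:

```rust
// Hypothetical helper, for illustration only.
use log::warn;

fn lookup_failed(lang: &str) {
    warn!("No sections to remove configured for lang {lang:?}");
}
```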

src/main.rs

@@ -9,9 +9,9 @@
 // --wikipedia-urls /tmp/wikipedia_urls.txt \
 // output_dir
 use std::{
-    fs::File,
+    fs::{create_dir, File},
     io::{stdin, BufRead, Write},
-    path::PathBuf,
+    path::{Path, PathBuf},
 };
 
 use anyhow::bail;
@@ -33,6 +33,37 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
+fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
+    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
+        // TODO: handle and still write
+        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+    };
+
+    let mut filename = dir.as_ref().to_owned();
+    filename.push(qid);
+    filename.push(&page.in_language.identifier);
+    filename.set_extension("html");
+
+    debug!("{:?}: {:?}", page.name, filename);
+
+    if filename.exists() {
+        debug!("Exists, skipping");
+        return Ok(());
+    }
+
+    let subfolder = filename.parent().unwrap();
+    if !subfolder.exists() {
+        create_dir(subfolder)?;
+    }
+
+    let html = simplify(&page.article_body.html, &page.in_language.identifier);
+
+    let mut file = File::create(&filename)?;
+    file.write_all(html.as_bytes())?;
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
@@ -79,24 +110,9 @@ fn main() -> anyhow::Result<()> {
             continue;
         }
 
-        let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-            warn!("Page in list but without wikidata qid: {:?}", page.name);
-            continue;
-        };
-
-        let filename = args.output_dir.join(qid).with_extension("html");
-        debug!("{:?}: {:?}", page.name, filename);
-
-        if filename.exists() {
-            debug!("Exists, skipping");
-            continue;
+        if let Err(e) = write(&args.output_dir, page) {
+            error!("Error writing article: {}", e);
         }
-
-        let html = simplify(&page.article_body.html);
-
-        let mut file = File::create(filename)?;
-        file.write_all(html.as_bytes())?;
     }
 
     Ok(())
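The new `write` helper nests output per language, producing `<output_dir>/<QID>/<lang>.html` where the old code wrote a flat `<output_dir>/<QID>.html`. A small sketch of the path construction (the QID and language are example values):

```rust
use std::path::PathBuf;

fn main() {
    // Mirrors the push/push/set_extension sequence in `write`.
    let mut filename = PathBuf::from("output_dir");
    filename.push("Q42"); // example Wikidata QID
    filename.push("en"); // example language identifier
    filename.set_extension("html");
    assert_eq!(filename, PathBuf::from("output_dir/Q42/en.html"));
}
```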

src/wm/mod.rs

@@ -58,15 +58,14 @@ pub fn is_wikipedia_match(
     titles: &HashSet<WikipediaTitleNorm>,
     page: &Page,
 ) -> Option<WikipediaTitleNorm> {
-    // TODO: handle multiple languages
-    let title = WikipediaTitleNorm::from_title(&page.name, "en");
+    let title = WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier);
     if titles.get(&title).is_some() {
         return Some(title);
     }
 
     for redirect in &page.redirects {
-        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
+        let title = WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier);
         if titles.get(&title).is_some() {
             return Some(title);
src/wm/page.rs

@@ -1,5 +1,6 @@
 use serde::Deserialize;
 
+// TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
 /// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
@@ -9,6 +10,7 @@ pub struct Page {
     // TODO: check if CoW has a performance impact
     pub name: String,
     pub date_modified: String,
+    pub in_language: Language,
     #[serde(default)]
     pub url: String,
     pub main_entity: Option<Wikidata>,
@@ -25,6 +27,8 @@ pub struct Wikidata {
 
 #[derive(Deserialize)]
 pub struct ArticleBody {
+    // TODO: look into RawValue to lazily parse/allocate this:
+    // https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
     pub html: String,
 }
@@ -34,3 +38,8 @@ pub struct Redirect {
     pub url: String,
     pub name: String,
 }
+
+#[derive(Deserialize)]
+pub struct Language {
+    pub identifier: String,
+}
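`Language` maps the dump's `in_language` object onto a plain identifier string. A minimal sketch of deserializing it in isolation (the test name and value are invented):

```rust
#[test]
fn language_identifier_parses() {
    // The dump nests the code as {"in_language": {"identifier": "en"}}.
    let lang: Language = serde_json::from_str(r#"{ "identifier": "en" }"#).unwrap();
    assert_eq!(lang.identifier, "en");
}
```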