Proof of Concept #3

Merged
newsch merged 6 commits from proof-of-concept into main 2023-06-23 19:50:04 +00:00
11 changed files with 1653 additions and 34 deletions

Cargo.lock (generated; 1089 lines changed)
File diff suppressed because it is too large.

Cargo.toml

@@ -4,10 +4,21 @@ version = "0.0.0"
license = "AGPL-3.0-only"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"
default-run = "om-wikiparser"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4.18"
once_cell = "1.18.0"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
url = "2.3.1"
urlencoding = "2.1.2"
[profile.release]
debug = true
overflow-checks = true

README.md

@@ -1,3 +1,8 @@
# wikiparser
_Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
## Usage
[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
biodranik commented 2023-06-22 17:27:08 +00:00 (Migrated from github.com)
Review

... It defines the article's sections that are not important for users and should be removed.

It defines article sections that are not important for users and should be removed.

article_processing_config.json (new file, 44 lines)

@@ -0,0 +1,44 @@
biodranik commented 2023-06-22 17:28:31 +00:00 (Migrated from github.com)
Review

Does it make sense to sort sections by name?

{
  "sections_to_remove": {
    "de": [
      "Anmerkungen",
      "Anmerkungen und Einzelnachweise",
      "Einzelbelege",
      "Einzelnachweise",
      "Filme",
      "Literatur",
      "Siehe auch",
      "Weblinks"
    ],
    "en": [
      "Bibliography",
      "External links",
      "Further reading",
      "References",
      "See also",
      "Sources"
    ],
    "es": [
      "Enlaces externos",
      "Referencias",
      "Véase también",
      "Vínculos de interés"
    ],
    "fr": [
      "Articles connexes",
      "Bibliographie",
      "Lien externe",
      "Liens externes",
      "Notes et références",
      "Références",
      "Voir aussi"
    ],
    "ru": [
      "Библиография",
      "Литература",
      "Примечания",
      "См. также",
      "Ссылки"
    ]
  }
}
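One possible answer to the sorting question above: a small test could enforce alphabetical order of each language's list. This is a hypothetical sketch, not part of this PR; it assumes the config sits at the repository root (as the README link suggests) and that "sorted" means byte-wise/code-point order, which is what `Vec<String>::sort` gives:

```rust
// Hypothetical test (e.g. in tests/config.rs), not part of this PR:
// check that every language's section list in
// article_processing_config.json stays alphabetically sorted.
use std::collections::BTreeMap;

#[test]
fn config_sections_are_sorted() {
    let config: BTreeMap<String, BTreeMap<String, Vec<String>>> =
        serde_json::from_str(include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/article_processing_config.json"
        )))
        .expect("config is valid JSON");
    for (lang, sections) in &config["sections_to_remove"] {
        let mut sorted = sections.clone();
        sorted.sort();
        assert_eq!(sections, &sorted, "sections for {lang} are not sorted");
    }
}
```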

benches/id_parsing.rs (new file, 43 lines)

@@ -0,0 +1,43 @@
#![feature(test)]

use std::{collections::HashSet, str::FromStr};

extern crate om_wikiparser;
extern crate test;

#[bench]
fn parse_wikipedia(b: &mut test::Bencher) {
    b.iter(|| {
        let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
            "https://en.wikipedia.org/wiki/Article_Title",
        )
        .unwrap();
    });
}

#[bench]
fn hash_wikipedia(b: &mut test::Bencher) {
    let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
        "https://en.wikipedia.org/wiki/Article_Title",
    )
    .unwrap();
    let mut set = HashSet::new();
    b.iter(|| {
        set.insert(&title);
    });
}

#[bench]
fn parse_wikidata(b: &mut test::Bencher) {
    b.iter(|| {
        let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
    });
}

#[bench]
fn hash_wikidata(b: &mut test::Bencher) {
    let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
    let mut set = HashSet::new();
    b.iter(|| {
        set.insert(&qid);
    });
}
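Note: these benchmarks use the unstable `test` crate (`#![feature(test)]`), so running them requires a nightly toolchain, e.g. `cargo +nightly bench`.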

src/bin/simplify_html.rs (new file, 18 lines)

@@ -0,0 +1,18 @@
//! Apply html article simplification to stdin, and write it to stdout.
//!
//! Usage:
//!     simplify_html < article.html > simplified.html
use std::io::{stdin, stdout, Read, Write};

use om_wikiparser::html::simplify;

fn main() -> anyhow::Result<()> {
    let mut input = String::new();
    stdin().read_to_string(&mut input)?;
    let output = simplify(&input, "en");
    stdout().write_all(output.as_bytes())?;
    Ok(())
}
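A quick way to try this helper from a checkout of the repo is `cargo run --bin simplify_html < article.html > simplified.html`. Note that the language is currently hard-coded to `"en"` here.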

src/html.rs (new file, 92 lines)

@@ -0,0 +1,92 @@
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
use std::collections::{BTreeMap, BTreeSet};
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
use once_cell::sync::Lazy;
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
use scraper::{ElementRef, Html, Selector};
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
use serde::Deserialize;
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
#[derive(Debug, Deserialize)]
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
struct Config<'a> {
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
#[serde(borrow)]
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
    serde_json::from_str(include_str!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/article_processing_config.json"
    )))
.expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
});
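Per the compile-time vs. runtime discussion above, a hedged sketch of what an optional runtime override could look like, using the `clap` derive API that is already in the dependencies. The flag name and the `OwnedConfig` type are illustrative, not the in-tree API:

```rust
use std::{
    collections::{BTreeMap, BTreeSet},
    fs,
    path::PathBuf,
};

use clap::Parser;
use serde::Deserialize;

// Owned variant of the config so it can outlive the file it was read from.
#[derive(Deserialize)]
struct OwnedConfig {
    sections_to_remove: BTreeMap<String, BTreeSet<String>>,
}

#[derive(Parser)]
struct Args {
    /// Optional path overriding the compiled-in article_processing_config.json.
    #[arg(long)]
    config: Option<PathBuf>,
}

fn load_config(args: &Args) -> anyhow::Result<OwnedConfig> {
    let text = match &args.config {
        // Runtime: read the file the user pointed at.
        Some(path) => fs::read_to_string(path)?,
        // Default: fall back to the config compiled into the binary.
        None => include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/article_processing_config.json"
        ))
        .to_string(),
    };
    Ok(serde_json::from_str(&text)?)
}
```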
static HEADERS: Lazy<Selector> =
    Lazy::new(|| Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap());
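A quick usage sketch for the `simplify` function below. The input HTML is hypothetical, and it assumes the `en` entry of the config lists a `References` section:

```rust
let html = r#"<html><body>
    <h2>Description</h2><p>Kept.</p>
    <h2>References</h2><ul><li>Dropped along with its heading.</li></ul>
</body></html>"#;
// Assumes "References" is listed under "en" in sections_to_remove.
let simplified = simplify(html, "en");
assert!(!simplified.contains("References"));
```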
pub fn simplify(html: &str, lang: &str) -> String {
    let mut document = Html::parse_document(html);

    let mut to_remove = Vec::new();

    // Remove configured sections and all trailing elements until next section.
    if let Some(bad_sections) = CONFIG.sections_to_remove.get(lang) {
        for header in document.select(&HEADERS) {
            // TODO: Should this join all text nodes?
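            // (One option: header.text().collect::<String>() joins them, at the cost of an allocation per header.)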
            let Some(title) = header.text().next() else {
                continue
            };
            if bad_sections.contains(&title.trim()) {
                to_remove.push(header.id());
                let header_level = header.value().name();
                // Strip trailing nodes.
                for sibling in header.next_siblings() {
                    if let Some(element) = sibling.value().as_element() {
                        if element.name() == header_level {
                            // TODO: Should this check for a higher level?
                            break;
                        }
                    }
                    to_remove.push(sibling.id());
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
for id in to_remove.drain(..) {
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
if let Some(mut node) = document.tree.get_mut(id) {
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
node.detach();
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
} else {
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
warn!("No sections to remove configured for lang {lang:?}");
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
}
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
// Remove elements with no text that isn't whitespace.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
for element in document
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
.root_element()
biodranik commented 2023-06-08 08:27:41 +00:00 (Migrated from github.com)
Review

Is there a more robust way to exclude some sections for all languages?

Is there a more robust way to exclude some sections for all languages?
newsch commented 2023-06-08 13:06:45 +00:00 (Migrated from github.com)
Review

Do you mean moving this to a configuration file, or something that works independent of the language?

Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names.

We could collapse it into a single set and apply it to all languages.

Do you mean moving this to a configuration file, or something that works independent of the language? Trying to parse the template text could be language independent, but I think that would be less robust than checking the header names. We could collapse it into a single set and apply it to all languages.
biodranik commented 2023-06-08 13:35:24 +00:00 (Migrated from github.com)
Review

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)

No need to collapse, a config looks like a better option (keep adding many other languages in mind, and potential contributors who doesn't read rust code)
newsch commented 2023-06-08 13:51:06 +00:00 (Migrated from github.com)
Review

Do you want to load it at compile time or runtime?

Do you want to load it at compile time or runtime?
newsch commented 2023-06-08 14:39:06 +00:00 (Migrated from github.com)
Review

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.

I added a compile-time json config in 7e6b39a, adding in a flag for loading one at runtime is straightforward if we want to do that later. Using a different config language is also a quick change.
biodranik commented 2023-06-22 17:30:41 +00:00 (Migrated from github.com)
Review
    // Remove sections.

nit: Normal sentences are more readable in many cases. Here and in other places.

```suggestion // Remove sections. ``` nit: Normal sentences are more readable in many cases. Here and in other places.
biodranik commented 2023-06-22 17:34:03 +00:00 (Migrated from github.com)
Review

What's needed to get right answers to these TODOs?

What's needed to get right answers to these TODOs?
biodranik commented 2023-06-22 17:36:08 +00:00 (Migrated from github.com)
Review

Should title be trimmed?

Should title be trimmed?
newsch commented 2023-06-23 16:20:03 +00:00 (Migrated from github.com)
Review

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size.
That's part of the next work.

I need to look through a good sample of the articles and check for things like formatting in headers, tags that are used, and which wikimedia meta tags are responsible for a third of the document size. That's part of the next work.
.descendants()
.filter_map(ElementRef::wrap)
{
if element.text().all(|t| t.trim().is_empty()) {
to_remove.push(element.id());
}
}
for id in to_remove.drain(..) {
if let Some(mut node) = document.tree.get_mut(id) {
node.detach();
}
}
biodranik commented 2023-06-22 21:36:12 +00:00 (Migrated from github.com)
Review

Can copy-paste be avoided?

newsch commented 2023-06-23 15:01:22 +00:00 (Migrated from github.com)
Review

I'll be improving and refactoring this in the next PR; if this bit survives, I'll move it to a function.

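For what it's worth, a sketch of one way the duplicated detach loop could be factored out once the logic settles, assuming `ego_tree` is added as a direct dependency for `NodeId` (scraper's tree type comes from it):

```rust
use ego_tree::NodeId;
use scraper::Html;

/// Detach the given nodes from the document tree, draining the scratch buffer.
fn detach_nodes(document: &mut Html, to_remove: &mut Vec<NodeId>) {
    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }
}
```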
document.html()
}
#[cfg(test)]
mod test {
biodranik commented 2023-06-22 21:37:15 +00:00 (Migrated from github.com)
Review

Is it hard to make a simple test for the function above?

newsch commented 2023-06-23 15:00:08 +00:00 (Migrated from github.com)
Review

No, I just didn't bother since I'll be changing this in the next PR. I can add some if you'd like.

use super::*;
#[test]
fn static_config_parses() {
assert!(!CONFIG.sections_to_remove.is_empty());
}
}
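Picking up newsch's offer above, a minimal test for the section-removal behavior might look like the sketch below. simplify is the function from om_wikiparser::html used in main.rs, but the HTML shape and the assumption that the "de" config lists "Einzelnachweise" are mine, not from this PR:

```rust
#[cfg(test)]
mod simplify_tests {
    use om_wikiparser::html::simplify;

    #[test]
    fn removes_configured_section() {
        // Assumption: "Einzelnachweise" is in sections_to_remove for "de".
        let html = r#"<article>
            <section><h2>Geschichte</h2><p>kept</p></section>
            <section><h2>Einzelnachweise</h2><ol><li>dropped</li></ol></section>
        </article>"#;
        let simplified = simplify(html, "de");
        assert!(simplified.contains("Geschichte"));
        assert!(!simplified.contains("Einzelnachweise"));
    }
}
```
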

5
src/lib.rs Normal file
View file

@ -0,0 +1,5 @@
pub mod html;
pub mod wm;
#[macro_use]
extern crate log;

src/main.rs
View file

@ -1,54 +1,118 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
// # prep outputs from map generator
// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
// # feed gzipped tarfile
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
// | cargo run --release -- \
// --wikidata-ids /tmp/wikidata_ids.txt \
// --wikipedia-urls /tmp/wikipedia_urls.txt \
// output_dir
use std::{
fs::{create_dir, File},
io::{stdin, BufRead, Write},
path::{Path, PathBuf},
};
use serde::Deserialize;
use std::io::{self, stdin, BufRead, BufReader, Write};
use anyhow::bail;
use clap::Parser;
#[macro_use]
extern crate log;
#[derive(Deserialize)]
struct Page {
// TODO: check if CoW has a performance impact
name: String,
date_modified: String,
#[serde(default)]
url: String,
main_entity: Option<Wikidata>,
// TODO: see what impact parsing/unescaping/allocating this has
article_body: ArticleBody,
#[serde(default)]
redirects: Vec<Redirect>,
use om_wikiparser::{
html::simplify,
wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
};
#[derive(Parser)]
struct Args {
output_dir: PathBuf,
#[arg(long)]
wikidata_ids: Option<PathBuf>,
#[arg(long)]
wikipedia_urls: Option<PathBuf>,
}
#[derive(Deserialize)]
struct Wikidata {
identifier: String,
}
fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
let Some(qid) = page.main_entity.map(|e| e.identifier) else {
// TODO: handle and still write
bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
};
#[derive(Deserialize)]
struct ArticleBody {
html: String,
}
let mut filename = dir.as_ref().to_owned();
filename.push(qid);
filename.push(&page.in_language.identifier);
filename.set_extension("html");
#[derive(Deserialize)]
struct Redirect {
url: String,
name: String,
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Exists, skipping");
return Ok(());
}
let subfolder = filename.parent().unwrap();
if !subfolder.exists() {
create_dir(subfolder)?;
}
let html = simplify(&page.article_body.html, &page.in_language.identifier);
let mut file = File::create(&filename)?;
file.write_all(html.as_bytes())?;
Ok(())
}
fn main() -> anyhow::Result<()> {
let dump = BufReader::new(stdin());
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()
.try_init()?;
// TODO: compare different deserialization methods
// docs warn against using a reader directly, and it's slower than tar can decompress the dump
let args = Args::parse();
info!("Loading urls");
let wikipedia_titles = args
.wikipedia_urls
.map(parse_wikipedia_file)
.transpose()?
.unwrap_or_default();
info!("Loading ids");
let wikidata_ids = args
.wikidata_ids
.map(parse_wikidata_file)
.transpose()?
.unwrap_or_default();
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
}
info!("Processing dump");
let dump = stdin().lock();
// TODO: Compare different deserialization methods.
// The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
// let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
let stream = dump.lines().map(|r| {
r.map_err(anyhow::Error::new)
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut stdout = io::stdout();
for page in stream {
let page = page?;
writeln!(stdout, "{}", page.name)?;
if !(is_wikidata_match(&wikidata_ids, &page).is_some()
|| is_wikipedia_match(&wikipedia_titles, &page).is_some())
{
continue;
}
if let Err(e) = write(&args.output_dir, page) {
error!("Error writing article: {}", e);
}
}
Ok(())

205
src/wm/mod.rs Normal file
View file

@ -0,0 +1,205 @@
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

> 1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article's JSON has its .in_language.identifier field set to en, and the program writes that HTML to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

> 2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

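To illustrate the doctest style being described — a hypothetical WikidataQid with a normalizing FromStr constructor; the real type and method names in src/wm/mod.rs may differ:

```rust
use std::{num::ParseIntError, str::FromStr};

/// Hypothetical illustration: a Wikidata QID like "Q42", stored as its numeric part.
#[derive(Debug, PartialEq)]
struct WikidataQid(u32);

impl FromStr for WikidataQid {
    type Err = ParseIntError;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Normalize: accept surrounding whitespace and a leading 'Q' or 'q'.
        let s = s.trim().trim_start_matches(|c| c == 'Q' || c == 'q');
        s.parse().map(WikidataQid)
    }
}

fn main() {
    // Both spellings normalize to the same value.
    assert_eq!("Q42".parse::<WikidataQid>().unwrap(), WikidataQid(42));
    assert_eq!(" q42 ".parse::<WikidataQid>().unwrap(), WikidataQid(42));
    // Error cases are reported to the caller, not swallowed.
    assert!("Wikipedia".parse::<WikidataQid>().is_err());
}
```
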
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
> 1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (a rough sketch of this option appears after this thread).

> 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be OK to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.

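For the third option in the list above — letting the program spawn the decompression itself — here is a rough sketch. Nothing like this is in the PR; the process_dumps helper and its threading layout are hypothetical:

```rust
use std::{
    io::{BufRead, BufReader},
    path::PathBuf,
    process::{Command, Stdio},
    thread,
};

/// Hypothetical helper: decompress each dump with `tar` in a child process
/// and feed the resulting newline-delimited JSON to the same per-line handling.
fn process_dumps(dumps: Vec<PathBuf>) -> anyhow::Result<()> {
    let mut handles = Vec::new();
    for dump in dumps {
        handles.push(thread::spawn(move || -> anyhow::Result<()> {
            let mut child = Command::new("tar")
                .args(["-x", "-z", "-O", "-f"])
                .arg(&dump)
                .stdout(Stdio::piped())
                .spawn()?;
            let reader = BufReader::new(child.stdout.take().unwrap());
            for line in reader.lines() {
                let line = line?;
                // ... deserialize a Page here and pass it to the worker pool ...
                let _ = line;
            }
            child.wait()?;
            Ok(())
        }));
    }
    for handle in handles {
        handle.join().expect("worker thread panicked")?;
    }
    Ok(())
}
```
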
//! Wikimedia types
use std::{collections::HashSet, ffi::OsStr, fs, num::ParseIntError, str::FromStr};
use anyhow::{anyhow, bail, Context};
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this comment).
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

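As a rough sketch of the third option above, the program could spawn the decompressor itself and stream its stdout. The `stream_dump` name, the command layout, and the `handle_line` callback are placeholders for this sketch, not the tool's actual interface:

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

use anyhow::Context;

/// Spawns `decompress_cmd` (e.g. `["gunzip", "-c", "dump.json.gz"]`) and
/// feeds each decompressed line to `handle_line`.
fn stream_dump(
    decompress_cmd: &[&str],
    mut handle_line: impl FnMut(&str) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
    let (cmd, args) = decompress_cmd.split_first().context("empty command")?;
    let mut child = Command::new(cmd)
        .args(args)
        .stdout(Stdio::piped())
        .spawn()
        .context("failed to spawn decompressor")?;
    let stdout = child.stdout.take().context("child stdout not captured")?;
    for line in BufReader::new(stdout).lines() {
        handle_line(&line?)?;
    }
    let status = child.wait()?;
    anyhow::ensure!(status.success(), "decompressor exited with {status}");
    Ok(())
}
```

Several of these could then run from one process, each feeding the same worker pool.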
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.

use url::Url;
mod page;
pub use page::Page;
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("bad QID value on line {line_num}: {line:?}")
})
})
.collect()
}
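
On the open question of testing error cases: a hypothetical unit test in the spirit of that discussion might look like the sketch below. Which malformed inputs `WikidataQid::from_str` actually rejects is an assumption here, so treat the inputs as illustrative rather than as the crate's real behavior.

```rust
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    // Hypothetical error cases; the real parser's rules may differ.
    #[test]
    fn rejects_malformed_qids() {
        for bad in ["", "12345", "Q", "Qabc", "W12345"] {
            assert!(WikidataQid::from_str(bad).is_err(), "{bad:?} should fail");
        }
    }
}
```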
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
    path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    let contents = fs::read_to_string(path.as_ref())?;
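    // Tag each parse failure with its 1-based line number so bad entries in
    // the url list are easy to locate (per the logging behavior discussed above).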
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikipediaTitleNorm::from_url(line).with_context(|| {
                let line_num = i + 1;
format!("bad wikipedia url on line {line_num}: {line:?}")
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (a sketch follows below).

  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

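A minimal sketch of that last option, assuming the decompression command line is supplied by the caller (the function name and error handling are hypothetical; it only assumes a `gunzip -c`-style command that writes to stdout):

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

/// Hypothetical: spawn e.g. `gunzip -c enwiki.json.gz` and feed its stdout
/// to the same line-oriented loop that normally consumes stdin.
fn process_via_subprocess(cmd: &str, args: &[&str]) -> anyhow::Result<()> {
    let mut child = Command::new(cmd)
        .args(args)
        .stdout(Stdio::piped())
        .spawn()?;
    let stdout = child.stdout.take().expect("stdout was piped");
    for line in BufReader::new(stdout).lines() {
        let _article_json = line?;
        // ... deserialize the article and hand it to the worker pool ...
    }
    let status = child.wait()?;
    anyhow::ensure!(status.success(), "decompressor exited with {status}");
    Ok(())
}
```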
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
})
})
.collect()
}
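/// Returns the page's main-entity QID if it parses and is one of the requested `ids`
/// (summary inferred from the fragment below).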
pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
    let Some(wikidata) = &page.main_entity else { return None; };
    let wikidata_id = &wikidata.identifier;
    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
        Ok(qid) => qid,
        Err(e) => {
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this list).
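
A rough sketch of that third option (the decompressor invocation and the per-line handoff are placeholders, not current wikiparser behavior):

```rust
use std::io::{BufRead, BufReader, Result};
use std::process::{Command, Stdio};

/// Spawn the user-supplied decompressor (e.g. `gunzip -c dump.ndjson.gz`)
/// and stream its stdout line by line; each NDJSON record would be handed
/// to the existing worker pool instead of printed.
fn process_dump(decompressor: &str, dump: &str) -> Result<()> {
    let mut child = Command::new(decompressor)
        .arg("-c") // write decompressed bytes to stdout
        .arg(dump)
        .stdout(Stdio::piped())
        .spawn()?;
    let stdout = child.stdout.take().expect("stdout was piped");
    for line in BufReader::new(stdout).lines() {
        let record = line?;
        println!("read article record of {} bytes", record.len());
    }
    child.wait()?; // reap the child; a robust version would check the exit status
    Ok(())
}

fn main() -> Result<()> {
    // one invocation per language dump; the path is a placeholder
    process_dump("gunzip", "enwiki-latest.ndjson.gz")
}
```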
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be OK to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.

warn!(
"Could not parse QID for {:?}: {:?}: {:#}",
page.name, wikidata_id, e
);
return None;
}
};
ids.get(&wikidata_id).map(|_| wikidata_id)
}
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
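/// Checks a dump page against the set of wanted article titles,
/// returning the normalized title on a match.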
pub fn is_wikipedia_match(
    titles: &HashSet<WikipediaTitleNorm>,
    page: &Page,
) -> Option<WikipediaTitleNorm> {
    match WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier) {
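        // A page whose title fails to parse is logged and treated as a non-match.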
        Err(e) => warn!("Could not parse title for {:?}: {:#}", page.name, e),
        Ok(title) => {
            if titles.get(&title).is_some() {
                return Some(title);
            }
        }
}
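// A redirect pointing at one of the requested titles counts as a match too.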
for redirect in &page.redirects {
match WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier) {
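// Redirect titles may fail to normalize; log the error and keep checking the rest.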
Err(e) => warn!(
"Could not parse redirect title for {:?}: {:?}: {:#}",
page.name, redirect.name, e
),
Ok(title) => {
if titles.get(&title).is_some() {
return Some(title);
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this list).
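
A rough sketch of that last option, assuming the decompression command is passed in as a program plus arguments (all names here are hypothetical):

```rust
use std::io::{BufReader, Read};
use std::process::{Command, Stdio};
use std::thread;

/// Sketch only: run `cmd args... <dump>` for each dump in parallel and
/// feed every decompressed stream to the same processing function.
fn process_dumps(cmd: &str, args: &[&str], dumps: &[&str]) -> std::io::Result<()> {
    thread::scope(|s| {
        // Spawn one decompressor subprocess per dump, each read by its own thread.
        let workers: Vec<_> = dumps
            .iter()
            .map(|dump| {
                s.spawn(move || -> std::io::Result<()> {
                    let mut child = Command::new(cmd)
                        .args(args)
                        .arg(dump)
                        .stdout(Stdio::piped())
                        .spawn()?;
                    let stdout = child.stdout.take().expect("stdout was piped");
                    handle_dump(BufReader::new(stdout))?; // the existing per-article loop
                    child.wait()?;
                    Ok(())
                })
            })
            .collect();
        // Propagate the first I/O error from any worker.
        workers
            .into_iter()
            .try_for_each(|w| w.join().expect("worker thread panicked"))
    })
}

fn handle_dump(_reader: impl Read) -> std::io::Result<()> {
    Ok(()) // placeholder for the real article-processing loop
}
```

Something like `process_dumps("gunzip", &["-c"], &dump_paths)` would then decompress and process the dumps concurrently, details depending on the archive format.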
> 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.

}
}
}
}
None
}
/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this reply).
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
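As a rough sketch of the third option above, using only the standard library (the decompression command and dump file names are assumptions, and the hand-off to the worker pool is elided):

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

fn main() -> std::io::Result<()> {
    // Hypothetical dump files; in practice these would come from the CLI.
    for dump in ["enwiki-html.json.gz", "dewiki-html.json.gz"] {
        // Spawn the decompression command and stream its stdout.
        let mut child = Command::new("gzip")
            .args(["-dc", dump])
            .stdout(Stdio::piped())
            .spawn()?;
        let stdout = child.stdout.take().expect("stdout is piped");
        for line in BufReader::new(stdout).lines() {
            let _article_json = line?; // hand each record to the worker pool here
        }
        child.wait()?;
    }
    Ok(())
}
```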

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
/// assert_eq!(with_q, without_q);
biodranik commented 2023-06-22 21:47:45 +00:00 (Migrated from github.com)
Review

Does it make sense to test it?

newsch commented 2023-06-23 16:20:34 +00:00 (Migrated from github.com)
Review

Same as [above](https://github.com/organicmaps/wikiparser/pull/3#discussion_r1239061793).
///
/// assert!(WikidataQid::from_str("q12345").is_ok());
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article JSON has a `.in_language.identifier` field set to `en`, and the program writes that HTML to a `QXXXXX/en.html` file.

Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files (a sketch of this layout follows this comment).
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more? (A possible unit-test sketch appears at the end of this thread.)
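For reference, the `QXXXXX/$lang.html` layout described above can be expressed as a small path-building helper; `article_path` and its arguments are illustrative, not the crate's actual API:

```rust
use std::path::PathBuf;

/// Build `<base>/<QID>/<lang>.html` for one article, mirroring the layout
/// described in the comment above (e.g. `Q12345/en.html`).
fn article_path(base: &str, qid: &str, lang: &str) -> PathBuf {
    let mut path = PathBuf::from(base);
    path.push(qid);
    path.push(format!("{lang}.html"));
    path
}

fn main() {
    assert_eq!(
        article_path("out", "Q12345", "en"),
        PathBuf::from("out/Q12345/en.html")
    );
}
```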
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review

  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review

  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`; not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool.

  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
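If more coverage is wanted than the doctest provides, the error cases from the surrounding doc comment could also be exercised in a unit-test module — a sketch under the assumption that `WikidataQid` implements `FromStr` as shown in this diff:

```rust
#[cfg(test)]
mod tests {
    use super::WikidataQid;
    use std::str::FromStr;

    /// The same malformed inputs the doctest rejects, checked one by one so
    /// a failure reports the offending input.
    #[test]
    fn rejects_malformed_qids() {
        for bad in ["", "Q", "Article_Title", "https://wikidata.org/wiki/Q12345"] {
            assert!(WikidataQid::from_str(bad).is_err(), "accepted {bad:?}");
        }
    }
}
```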
/// assert!(WikidataQid::from_str("Article_Title").is_err());
/// assert!(WikidataQid::from_str("Q").is_err());
/// assert!(WikidataQid::from_str("").is_err());
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);
impl FromStr for WikidataQid {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review

  1. Is it in English only now?
  2. Does it make sense to test this function?

newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

> 1. Is it in English only now?

Currently it works with any language, but it only processes a single dump at a time. So when it reads an English dump, each article's JSON has an `.in_language.identifier` field set to `en`, and the program writes that HTML to a `QXXXXX/en.html` file.

Running the program multiple times with different language dumps fills in the various `QXXXXX/$lang.html` files (see the layout sketch after the function below).
We could extend it to process multiple dumps in parallel, but I don't expect much of a speedup right now.

> 2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but it doesn't check the various error cases (see the test sketch after the function below).
Do you think there should be more?
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(WikidataQid)
}
}
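On the testing question: the error cases newsch mentions could be covered by a small unit test next to the doctest. A sketch, assuming `WikidataQid` is a tuple struct around a `u32` as the diff suggests; the test module and case list are illustrative, not part of this PR.

```rust
#[cfg(test)]
mod tests {
    use super::WikidataQid;
    use std::str::FromStr;

    #[test]
    fn parses_and_normalizes_qids() {
        // Upper- and lowercase prefixes and surrounding whitespace all normalize.
        for s in ["Q42", "q42", " Q42 "] {
            assert!(matches!(WikidataQid::from_str(s), Ok(WikidataQid(42))));
        }
        // Error cases that the doctest does not currently cover.
        for s in ["", "Q", "Qfoo", "42Q", "https://www.wikidata.org/wiki/Q42"] {
            assert!(WikidataQid::from_str(s).is_err());
        }
    }
}
```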
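And on the multi-run layout newsch describes, writing each dump's language under the article's QID directory might look like this; the function name and signature are illustrative assumptions.

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Each run adds its dump's language file under the article's QID directory,
/// e.g. Q42/en.html after the English run and Q42/de.html after the German one.
fn write_article(out_dir: &Path, qid: u32, lang: &str, html: &str) -> io::Result<()> {
    let dir = out_dir.join(format!("Q{qid}"));
    fs::create_dir_all(&dir)?;
    fs::write(dir.join(format!("{lang}.html")), html)
}
```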
/// Normalized Wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
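A minimal sketch of the kind of normalization this doc comment describes. The real type's name and fields are not shown in this excerpt, so `TitleNorm` and its constructors are illustrative assumptions; lowercasing the whole title is a simplification (Wikipedia titles are only case-insensitive in the first character), and a real implementation would also percent-decode URL segments.

```rust
/// Illustrative stand-in for the normalized-title type documented above.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct TitleNorm(String);

impl TitleNorm {
    /// Normalize a bare title, or the title part of an osm-style `lang:Title` tag.
    fn from_title(title: &str) -> TitleNorm {
        TitleNorm(title.trim().replace(' ', "_").to_lowercase())
    }

    /// Normalize the last path segment of an article URL, dropping any `#fragment`.
    fn from_url(url: &str) -> Option<TitleNorm> {
        let segment = url.rsplit('/').next()?;
        let title = segment.split('#').next()?;
        Some(TitleNorm(title.trim().replace(' ', "_").to_lowercase()))
    }
}

fn main() {
    let from_title = TitleNorm::from_title("Spatial Database");
    let from_tag = TitleNorm::from_title("en:Spatial Database".split_once(':').unwrap().1);
    let from_url =
        TitleNorm::from_url("https://en.wikipedia.org/wiki/Spatial_database#Geodatabase").unwrap();
    // All three input forms compare equal after normalization.
    assert_eq!(from_title, from_url);
    assert_eq!(from_tag, from_url);
}
```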
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
biodranik commented 2023-06-22 21:47:31 +00:00 (Migrated from github.com)
Review

Does it make sense to test it?

newsch commented 2023-06-23 16:08:14 +00:00 (Migrated from github.com)
Review

I added some checks for whitespace, empty strings, and tests for errors in 70f7edf; is there something else you think should be handled?
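For illustration, a minimal sketch of what such error-case tests could look like, assuming the constructors reject empty and whitespace-only titles as described (the actual tests in 70f7edf may differ):

```rust
#[cfg(test)]
mod tests {
    use super::WikipediaTitleNorm;

    // Assumes from_title returns Err for empty/whitespace input,
    // per the checks described above.
    #[test]
    fn empty_and_whitespace_titles_are_errors() {
        assert!(WikipediaTitleNorm::from_title("", "en").is_err());
        assert!(WikipediaTitleNorm::from_title(" \t", "en").is_err());
    }

    // Mirrors the error cases shown in the doctest below.
    #[test]
    fn non_article_urls_are_errors() {
        assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
        assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
    }
}
```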
///
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
/// ```
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be OK to run tasks in parallel using a Bash for loop. We'll rethink it if necessary later.

#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
    lang: String,
    name: String,
}
impl WikipediaTitleNorm {
    fn normalize_title(title: &str) -> String {
        // TODO: Compare with map generator url creation, ensure covers all cases.
        title.trim().replace(' ', "_")
    }
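To complement the doctest mentioned in the thread above, here is a sketch of a unit test for this helper; it only covers the normalization itself, since the constructors and their error cases aren't visible in this diff:

```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_title_trims_and_replaces_spaces() {
        // Surrounding whitespace is trimmed and inner spaces become underscores.
        assert_eq!(
            WikipediaTitleNorm::normalize_title(" Statue of Liberty "),
            "Statue_of_Liberty"
        );
        // Already-normalized titles pass through unchanged.
        assert_eq!(
            WikipediaTitleNorm::normalize_title("Statue_of_Liberty"),
            "Statue_of_Liberty"
        );
    }
}
```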
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it works with any language, but it only processes a single dump at a time. So when it reads an English dump, each article JSON has an .in_language.identifier field set to en, and the program writes that HTML to a QXXXXX/en.html file.

Running the program multiple times with different language dumps fills in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect much of a speedup from that right now.

  2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but it doesn't check the various error cases.
Do you think there should be more?
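For concreteness, here is a minimal sketch of the per-language layout described above. The helper names (`article_path`, `write_article`) and the `descriptions` base directory are hypothetical, not the crate's actual API:

```rust
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

// Each article gets a per-QID directory with one HTML file per dump language,
// e.g. descriptions/Q42/en.html after an English run, then de.html after a German run.
fn article_path(base: &Path, qid: &str, lang: &str) -> PathBuf {
    base.join(qid).join(format!("{lang}.html"))
}

fn write_article(base: &Path, qid: &str, lang: &str, html: &str) -> io::Result<()> {
    let path = article_path(base, qid, lang);
    fs::create_dir_all(path.parent().expect("article path has a parent dir"))?;
    fs::write(path, html)
}
```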

biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?

I haven't tried it out yet, but a couple of options come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar; I'm not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this comment).

  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
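As a rough illustration of the third option, here is a sketch of spawning the decompressor as a subprocess and streaming its stdout. This is not part of the current CLI, and `gunzip -c` is just one possible command:

```rust
use std::io::{self, BufRead, BufReader};
use std::process::{Command, Stdio};

// Spawn `gunzip -c <path>` and expose its stdout as a buffered reader.
// A real version would also keep the Child around to wait() on it.
fn open_dump(path: &str) -> io::Result<impl BufRead> {
    let child = Command::new("gunzip")
        .arg("-c") // decompress to stdout instead of replacing the file
        .arg(path)
        .stdout(Stdio::piped())
        .spawn()?;
    Ok(BufReader::new(child.stdout.expect("stdout was piped")))
}
```

Several of these children could be spawned at once, each feeding the same worker pool.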

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be OK to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
// https://en.wikipedia.org/wiki/Article_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
    let url = Url::parse(url.trim())?;
    let (subdomain, host) = url
        .host_str()
.ok_or_else(|| anyhow!("Expected host"))?
        .split_once('.')
        .ok_or_else(|| anyhow!("Expected subdomain"))?;
    if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
}
let lang = subdomain;
let mut paths = url
    .path_segments()
.ok_or_else(|| anyhow!("Expected path"))?;
let root = paths
    .next()
.ok_or_else(|| anyhow!("Expected first segment in path"))?;
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article JSON has an `.in_language.identifier` field set to `en`, and the program writes that HTML to a `QXXXXX/en.html` file.

Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.
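
For illustration, that routing could look roughly like this. It is a minimal sketch: only the fields needed to pick the output path are shown, and `main_entity` is an assumed field name rather than one confirmed in this PR:

```rust
use std::path::{Path, PathBuf};

use serde::Deserialize;

/// Minimal sketch of the dump's per-article JSON; `main_entity`
/// is an assumption, not a field confirmed in this PR.
#[derive(Deserialize)]
struct Article {
    main_entity: Entity,   // Wikidata item, e.g. "Q42"
    in_language: Language, // e.g. "en" for an English dump
}

#[derive(Deserialize)]
struct Entity {
    identifier: String,
}

#[derive(Deserialize)]
struct Language {
    identifier: String,
}

/// Build the `QXXXXX/<lang>.html` output path for one article.
fn output_path(base: &Path, article: &Article) -> PathBuf {
    base.join(&article.main_entity.identifier)
        .join(format!("{}.html", article.in_language.identifier))
}
```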

  2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?
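
For example, error-case tests might look something like this; `WikipediaTitleNorm::from_url` is a stand-in for whichever constructor the type actually exposes:

```rust
#[cfg(test)]
mod tests {
    use super::*;

    // Sketch only: `WikipediaTitleNorm::from_url` stands in for the
    // actual constructor; the URLs below should all be rejected.
    #[test]
    fn rejects_non_wikipedia_domain() {
        assert!(WikipediaTitleNorm::from_url("https://example.com/wiki/Article").is_err());
    }

    #[test]
    fn rejects_missing_title_segment() {
        assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/").is_err());
    }
}
```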

biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is fine (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is fine (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python's `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch below).
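
A rough sketch of that last option, assuming the caller supplies the decompression command (e.g. `gunzip -c dump.json.gz`); the worker-pool handoff is elided:

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

use anyhow::Context;

/// Spawn the user-supplied decompression command and stream its
/// stdout line by line into the existing processing loop.
fn stream_dump(cmd: &str, args: &[&str]) -> anyhow::Result<()> {
    let mut child = Command::new(cmd)
        .args(args)
        .stdout(Stdio::piped())
        .spawn()
        .context("spawning decompressor")?;

    let stdout = child.stdout.take().context("no stdout handle")?;
    for line in BufReader::new(stdout).lines() {
        let line = line?;
        // ... deserialize the article JSON and hand it to the worker pool ...
        let _ = &line;
    }

    let status = child.wait()?;
    anyhow::ensure!(status.success(), "decompressor exited with {status}");
    Ok(())
}
```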
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article JSON has a .in_language.identifier field set to en, and the program writes that HTML to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?
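For reference, after running against an English and then a German dump, the output tree would look something like this (the descriptions directory name and the QID are hypothetical examples, not taken from this PR):

```
descriptions/
└── Q42/
    ├── en.html
    └── de.html
```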
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind (see the sketch after this thread):

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar; not sure about Python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool.

  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
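A minimal sketch of that wrapper approach, combining the parallel for-loop and the serial-decompression options discussed above. The om-wikiparser invocation is a placeholder: an output-directory argument with the decompressed dump on stdin is an assumption for illustration, not the confirmed CLI.

```bash
#!/usr/bin/env bash
# Hypothetical wrapper; the om-wikiparser arguments below are assumptions.
set -euo pipefail

OUTPUT_DIR=descriptions  # assumed directory holding the QXXXXX/$lang.html files

# Parallel: one process per language dump, joined at the end.
for dump in dumps/*.json.tar.gz; do
    tar xzOf "$dump" | om-wikiparser "$OUTPUT_DIR" &
done
wait

# Serial alternative: concatenate all decompressed dumps into one stdin stream.
# for dump in dumps/*.json.tar.gz; do tar xzOf "$dump"; done | om-wikiparser "$OUTPUT_DIR"
```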
if root != "wiki" {
bail!("Expected 'wiki' in path")
}
let title = paths
    .next()
    .ok_or_else(|| anyhow!("Expected second segment in path"))?;
let title = urlencoding::decode(title)?;
Self::from_title(&title, lang)
}
// en:Article Title
fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
    let (lang, title) = tag
        .trim()
        .split_once(':')
        .ok_or_else(|| anyhow!("Expected ':'"))?;
    Self::from_title(title, lang)
}
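For reference, a hedged sketch of the tag format this helper accepts: the value of an OSM `wikipedia` tag pairs a language code and an article title around a colon. The examples below are illustrative inputs traced through the `trim`/`split_once` logic above; how `from_title` normalizes the result is assumed from context.

```rust
// Illustrative inputs and the (lang, title) split performed above:
//   "en:Article Title"  -> ("en", "Article Title")
//   " de:Berlin "       -> ("de", "Berlin")      // outer whitespace trimmed first
//   "Article Title"     -> Err("Expected ':'")   // no colon to split on
```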
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

> 1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article's JSON has an `.in_language.identifier` field set to `en`, and the program writes that HTML to a `QXXXXX/en.html` file.

Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files (see the sketch below).
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

> 2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?
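A minimal sketch of the per-language layout described above, assuming a base output directory; the `article_path` helper and its name are hypothetical, not part of the codebase:

```rust
use std::path::{Path, PathBuf};

/// Hypothetical helper: where the HTML for one article/language pair lands.
/// `qid` is the Wikidata ID (e.g. "Q42"); `lang` comes from the dump's
/// `.in_language.identifier` field (e.g. "en").
fn article_path(base: &Path, qid: &str, lang: &str) -> PathBuf {
    base.join(qid).join(format!("{lang}.html"))
}

// One run per dump fills in sibling files under the same QID directory:
//   article_path(base, "Q42", "en") -> base/Q42/en.html
//   article_path(base, "Q42", "de") -> base/Q42/de.html
```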
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
> 1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but it should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`; I'm not sure about Python's `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocesses directly; it could do this in parallel and pass the results to the same worker pool (sketched below).

> 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
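A rough sketch of that third option, assuming the decompression command is passed in by the caller; `stream_dump` and its interface are illustrative, not the actual CLI:

```rust
use std::io::BufRead;
use std::process::{Command, Stdio};

/// Illustrative only: spawn a user-supplied decompression command for one
/// dump and stream its stdout line by line (one JSON article per line),
/// which could then be handed to the existing worker pool.
fn stream_dump(cmd: &str, args: &[&str]) -> anyhow::Result<()> {
    let mut child = Command::new(cmd)
        .args(args)
        .stdout(Stdio::piped())
        .spawn()?;
    let stdout = child.stdout.take().expect("stdout was piped");
    for line in std::io::BufReader::new(stdout).lines() {
        let _article_json = line?; // hand off to the worker pool here
    }
    child.wait()?;
    Ok(())
}
```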
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an English dump, each article JSON has an `.in_language.identifier` field set to `en`, and the program writes that HTML to a `QXXXXX/en.html` file.

Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  2. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?
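To make that concrete, here is a minimal sketch of the per-language output step, assuming a serde view of the dump's JSON; `in_language.identifier` is the field described above, while the struct names, the `qid` parameter, and `output_path` itself are illustrative stand-ins, not the actual implementation:

```rust
use serde::Deserialize;
use std::path::{Path, PathBuf};

// Sketch only: struct names are stand-ins; `in_language.identifier`
// mirrors the dump field described in the comment above.
#[derive(Deserialize)]
struct Language {
    identifier: String, // e.g. "en"
}

#[derive(Deserialize)]
struct Article {
    in_language: Language,
}

/// Build the `QXXXXX/$lang.html` path under `base` for one parsed article.
fn output_path(base: &Path, qid: &str, article: &Article) -> PathBuf {
    base.join(qid)
        .join(format!("{}.html", article.in_language.identifier))
}
```

Each run only touches files for the language of the dump being read, which is why repeated runs over different dumps fill in the remaining `QXXXXX/$lang.html` files.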
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is OK (and can be parallelized in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about Python `pgzip`.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly; it could do this in parallel and pass the results to the same worker pool (see the sketch after this comment).

  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of URLs/IDs from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
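For the third option, a rough Rust shape could look like the following; the `gunzip -c` command and archive name are placeholders, and a real version would feed each JSON line to the existing worker pool instead of discarding it:

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

fn main() -> std::io::Result<()> {
    // Placeholder command and file name for illustration.
    let mut child = Command::new("gunzip")
        .args(["-c", "enwiki-NS0-dump.ndjson.gz"])
        .stdout(Stdio::piped())
        .spawn()?;
    let reader = BufReader::new(child.stdout.take().expect("stdout was piped"));
    for line in reader.lines() {
        let _json_line = line?; // hand off to the worker pool here
    }
    child.wait()?;
    Ok(())
}
```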
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be OK to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
let title = title.trim();
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
let lang = lang.trim();
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
if title.is_empty() {
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
bail!("title cannot be empty or whitespace");
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
}
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
if lang.is_empty() {
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
bail!("lang cannot be empty or whitespace");
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
}
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.

It should be ok to run tasks in parallel using bash for loop. We'll rethink it if necessary later.
let name = Self::normalize_title(title);
biodranik commented 2023-06-22 21:45:20 +00:00 (Migrated from github.com)
Review
  1. Is it in English only now?
  2. Does it make sense to test this function?
1. Is it in English only now? 2. Does it make sense to test this function?
newsch commented 2023-06-22 23:33:17 +00:00 (Migrated from github.com)
Review

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it.

  1. Is it in English only now?

Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a .in_language.identifier field set to en, and the program writes that html to a QXXXXX/en.html file.

Running the program multiple times with different language dumps will fill in the various QXXXXX/$lang.html files.
We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now.

  1. Does it make sense to test this function?

The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases.
Do you think there should be more?

Oops, that comment slipped through when I handled multiple languages in 6e5385d. I'll remove it. > 1. Is it in English only now? Currently it will work with any language, but it only processes a single dump at a time. So when it reads an english dump, each article json has a `.in_language.identifier` field set to `en`, and the program writes that html to a `QXXXXX/en.html` file. Running the program multiple times with different language dumps will fill in the various `QXXXXX/$lang.html` files. We could extend it to process multiple dumps in parallel, but I don't expect there to be much of a speedup right now. > 2. Does it make sense to test this function? The doctest on the type definition verifies that the two constructors parse and normalize correctly, but doesn't check for various error cases. Do you think there should be more?
biodranik commented 2023-06-23 06:08:13 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?
  2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes.
newsch commented 2023-06-23 14:41:22 +00:00 (Migrated from github.com)
Review
  1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach?

I haven't tried it out yet, but there are a couple of options that come to mind:

  • Decompress the archives serially so they're all concatenated together into stdin. This looks possible with gunzip/tar, not sure about python pgzip.
  • As you say, run the program repeatedly through a wrapper script, using a for loop, xargs, parallel, etc.
  • Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool.
  1. It may make sense to check values that are in OSM. Users can make a lot of mistakes.

Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.

> 1. Does it imply a wrapping script that launches the app for each language/file? That is ok (and can be paralleled in bash by launching several processes simultaneously), but should be documented. Or is there a better approach? I haven't tried it out yet, but there are a couple of options that come to mind: - Decompress the archives serially so they're all concatenated together into stdin. This looks possible with `gunzip`/`tar`, not sure about python `pgzip`. - As you say, run the program repeatedly through a wrapper script, using a for loop, `xargs`, `parallel`, etc. - Pass the decompression command to the program and have it spawn the subprocess directly, it could do this in parallel and pass the results to the same worker pool. > 2. It may make sense to check values that are in OSM. Users can make a lot of mistakes. Understood. I have been using the world list of urls/ids from the map generator with no problems, but if we switch to using OSM data directly I'll rethink this. The program will log any issues it has parsing titles/QIDs.
biodranik commented 2023-06-23 17:16:40 +00:00 (Migrated from github.com)
Review

It should be ok to run tasks in parallel using a bash for loop. We'll rethink it if necessary later.
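On the OSM-value checking point: a rough sketch of parse-and-log validation, assuming a hypothetical `Qid` type — the crate's real constructors and normalization rules may differ:

```rust
use std::str::FromStr;

use log::warn;

/// Hypothetical normalized Wikidata item ID, e.g. "Q42".
#[derive(Debug, PartialEq, Eq)]
struct Qid(u64);

impl FromStr for Qid {
    type Err = std::num::ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Tolerate "Q42", "q42", or a bare "42" from hand-edited OSM tags.
        let digits = s.trim().trim_start_matches(|c: char| c == 'Q' || c == 'q');
        digits.parse().map(Qid)
    }
}

/// Parse OSM-provided values, logging malformed entries instead of
/// aborting the whole run.
fn parse_qids(raw: &[&str]) -> Vec<Qid> {
    raw.iter()
        .filter_map(|s| match s.parse::<Qid>() {
            Ok(qid) => Some(qid),
            Err(e) => {
                warn!("skipping malformed QID {s:?}: {e}");
                None
            }
        })
        .collect()
}
```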
let lang = lang.to_owned();
Ok(Self { name, lang })
}
}

45
src/wm/page.rs Normal file
View file

@ -0,0 +1,45 @@
use serde::Deserialize;

// TODO: consolidate into single struct
/// Deserialized Wikimedia Enterprise API Article
///
/// For all available fields, see <https://enterprise.wikimedia.com/docs/data-dictionary/>.
#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Page {
    // TODO: Check if CoW has a performance impact.
    pub name: String,
    pub date_modified: String,
    pub in_language: Language,
    #[serde(default)]
    pub url: String,
    pub main_entity: Option<Wikidata>,
    // TODO: See what impact parsing/unescaping/allocating this has.
    pub article_body: ArticleBody,
    #[serde(default)]
    pub redirects: Vec<Redirect>,
}

#[derive(Deserialize)]
pub struct Wikidata {
    pub identifier: String,
}

#[derive(Deserialize)]
pub struct ArticleBody {
    // TODO: Look into RawValue to lazily parse/allocate this:
    // https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
    pub html: String,
}

#[allow(dead_code)] // TODO: Reevaluate fields.
#[derive(Deserialize)]
pub struct Redirect {
    pub url: String,
    pub name: String,
}

#[derive(Deserialize)]
pub struct Language {
    pub identifier: String,
}
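The dump is line-delimited JSON, so each line deserializes directly into `Page`; a minimal usage sketch (the `read_pages` helper is illustrative, not part of this file):

```rust
use std::io::BufRead;

/// Read one JSON article per line and print pages that carry a
/// Wikidata QID. Illustrative only; not the crate's actual driver.
fn read_pages(reader: impl BufRead) -> anyhow::Result<()> {
    for line in reader.lines() {
        let page: Page = serde_json::from_str(&line?)?;
        if let Some(wd) = &page.main_entity {
            println!("{} [{}] -> {}", page.name, page.in_language.identifier, wd.identifier);
        }
    }
    Ok(())
}
```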