Write to generator-compatible folder structure (#6)

The map generator expects the folder structure created by the current
scraper so that it can add the article content to the mwm files.

- Article HTML is written to the wikidata directory.
- Directories are created for any matched titles and symlinked to the
  wikidata directory, as illustrated below.
- Articles without a QID are written to the article title directory.
- Article titles containing `/` are not escaped, so multiple
  subdirectories are possible.

The output folder hierarchy looks like this:

    .
    ├── de.wikipedia.org
    │  └── wiki
    │     ├── Coal_River_Springs_Territorial_Park
    │     │  ├── de.html
    │     │  └── ru.html
    │     ├── Ni'iinlii_Njik_(Fishing_Branch)_Territorial_Park
    │     │  ├── de.html
    │     │  └── en.html
    │    ...
    ├── en.wikipedia.org
    │  └── wiki
    │     ├── Arctic_National_Wildlife_Refuge
    │     │  ├── de.html
    │     │  ├── en.html
    │     │  ├── es.html
    │     │  ├── fr.html
    │     │  └── ru.html
    │     ├── Baltimore
    │     │  └── Washington_International_Airport
    │     │     ├── de.html
    │     │     ├── en.html
    │     │     ├── es.html
    │     │     ├── fr.html
    │     │     └── ru.html
    │    ...
    └── wikidata
       ├── Q59320
       │  ├── de.html
       │  ├── en.html
       │  ├── es.html
       │  ├── fr.html
       │  └── ru.html
       ├── Q120306
       │  ├── de.html
       │  ├── en.html
       │  ├── es.html
       │  ├── fr.html
       │  └── ru.html
      ...
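
Matched title directories are symlinks pointing at the corresponding
QID directory, for example (the title-to-QID pairing here is
hypothetical):

    en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge -> wikidata/Q59320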

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

--- a/README.md
+++ b/README.md

@@ -2,7 +2,45 @@
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
 
-## Usage
+## Configuring
 
 [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
-It defines article sections that are not important for users and should be removed.
+It defines article sections that are not important for users and should be removed from the extracted HTML.
+
+## Usage
+
+First, install [the Rust language tools](https://www.rust-lang.org/).
+For best performance, use `--release` when building or running.
+You can run the program from within this directory using `cargo run --release --`.
+Alternatively, build it with `cargo build --release`, which places the binary in `./target/release/om-wikiparser`.
+Run the program with the `--help` flag to see all supported arguments.
+
+It takes as inputs:
+- A Wikipedia Enterprise JSON dump, extracted and piped to `stdin`.
+- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
+- A file of Wikipedia article URLs to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as the CLI flag `--wikipedia-urls`.
+- A directory to write the extracted articles to, as a CLI argument.
+
+As an example of usage with the map generator:
+- Assuming this program is installed to `$PATH` as `om-wikiparser`.
+- Download [the dumps in the desired languages](https://dumps.wikimedia.org/other/enterprise_html/runs/) (use the files with the format `${LANG}wiki-NS0-${DATE}-ENTERPRISE-HTML.json.tar.gz`) and set `DUMP_DOWNLOAD_DIR` to the location they are downloaded to.
+- Run the following from within the `intermediate_data` subdirectory of the maps build directory:
+```shell
+# Transform intermediate files from the generator.
+cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
+tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Begin extraction.
+for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
+do
+  tar xzOf $dump | om-wikiparser \
+    --wikidata-ids wikidata_ids.txt \
+    --wikipedia-urls wikipedia_urls.txt \
+    descriptions/
+done
+```
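
For reference, the two derived input files are plain lists with one value per line; the entries below are illustrative:

```
$ head -2 wikidata_ids.txt
Q59320
Q120306
$ head -2 wikipedia_urls.txt
https://en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge
https://de.wikipedia.org/wiki/Coal_River_Springs_Territorial_Park
```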

--- a/src/main.rs
+++ b/src/main.rs

@@ -1,27 +1,18 @@
-// Usage:
-// # prep outputs from map generator
-// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
-// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
-// # feed gzipped tarfile
-// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
-//   | cargo run --release -- \
-//     --wikidata-ids /tmp/wikidata_ids.txt \
-//     --wikipedia-urls /tmp/wikipedia_urls.txt \
-//     output_dir
 use std::{
-    fs::{create_dir, File},
+    fs::{self, File},
     io::{stdin, BufRead, Write},
+    os::unix,
     path::{Path, PathBuf},
 };
 
-use anyhow::bail;
+use anyhow::{anyhow, bail, Context};
 use clap::Parser;
 #[macro_use]
 extern crate log;
 
 use om_wikiparser::{
     html::simplify,
-    wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
+    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
#[derive(Parser)]
@@ -33,33 +24,115 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
-fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
-    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-        // TODO: handle and still write
-        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
+fn create_article_dir(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<PathBuf> {
+    let base = base.as_ref();
+    let mut redirects = redirects.into_iter();
+
+    let main_dir = match page.wikidata() {
+        None => {
+            // Write to wikipedia title directory.
+            // Prefer first redirect, fall back to page title if none exist
+            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
+            redirects
+                .next()
+                .or_else(|| match page.title() {
+                    Ok(title) => Some(title),
+                    Err(e) => {
+                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
+                        None
+                    }
+                })
+                // hard fail when no titles can be parsed
+                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
+                .get_dir(base.to_owned())
+        }
+        Some(qid) => {
+            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
+            qid.get_dir(base.to_owned())
+        }
     };
 
-    let mut filename = dir.as_ref().to_owned();
-    filename.push(qid);
+    if main_dir.is_symlink() {
+        fs::remove_file(&main_dir)
+            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
+    }
+    fs::create_dir_all(&main_dir)
+        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
+
+    // Write symlinks to main directory.
+    // TODO: Only write redirects that we care about.
+    for title in redirects {
+        let wikipedia_dir = title.get_dir(base.to_owned());
+
+        // Build required directory.
+        //
+        // Possible states from previous run:
+        // - Does not exist (and is not a symlink)
+        // - Exists, is a directory
+        // - Exists, is a valid symlink to correct location
+        // - Exists, is a valid symlink to incorrect location
+        if wikipedia_dir.exists() {
+            if wikipedia_dir.is_symlink() {
+                // Only replace if not valid
+                if fs::read_link(&wikipedia_dir)? == main_dir {
+                    continue;
+                }
+                fs::remove_file(&wikipedia_dir)?;
+            } else {
+                fs::remove_dir_all(&wikipedia_dir)?;
+            }
+        } else {
+            // titles can contain `/`, so ensure necessary subdirs exist
+            let parent_dir = wikipedia_dir.parent().unwrap();
+            fs::create_dir_all(parent_dir)
+                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
+        }
+
+        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
+            format!(
+                "creating symlink from {:?} to {:?}",
+                wikipedia_dir, main_dir
+            )
+        })?;
+    }
+
+    Ok(main_dir)
+}
+
+/// Write selected article to disk.
+///
+/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
+/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
+/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
+fn write(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let article_dir = create_article_dir(base, page, redirects)?;
+
+    // Write html to determined file.
+    let mut filename = article_dir;
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");
 
     debug!("{:?}: {:?}", page.name, filename);
 
     if filename.exists() {
-        debug!("Exists, skipping");
-        return Ok(());
-    }
-
-    let subfolder = filename.parent().unwrap();
-    if !subfolder.exists() {
-        create_dir(subfolder)?;
+        debug!("Overwriting existing file");
     }
 
     let html = simplify(&page.article_body.html, &page.in_language.identifier);
 
-    let mut file = File::create(&filename)?;
-    file.write_all(html.as_bytes())?;
+    let mut file =
+        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
+    file.write_all(html.as_bytes())
+        .with_context(|| format!("writing html file {:?}", filename))?;
 
     Ok(())
 }
@@ -104,14 +177,28 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        if !(is_wikidata_match(&wikidata_ids, &page).is_some()
-            || is_wikipedia_match(&wikipedia_titles, &page).is_some())
-        {
+        let is_wikidata_match = page
+            .wikidata()
+            .map(|qid| wikidata_ids.contains(&qid))
+            .unwrap_or_default();
+
+        let matching_titles = page
+            .all_titles()
+            .filter_map(|r| {
+                r.map(Some).unwrap_or_else(|e| {
+                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                    None
+                })
+            })
+            .filter(|t| wikipedia_titles.contains(t))
+            .collect::<Vec<_>>();
+
+        if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
-        if let Err(e) = write(&args.output_dir, page) {
-            error!("Error writing article: {}", e);
+        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
+            error!("Error writing article {:?}: {:#}", page.name, e);
         }
     }

--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs

@@ -1,5 +1,8 @@
 //! Wikimedia types
-use std::{collections::HashSet, ffi::OsStr, fs, num::ParseIntError, str::FromStr};
+use std::{
+    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
+    str::FromStr,
+};
 
 use anyhow::{anyhow, bail, Context};
@@ -40,53 +43,6 @@ pub fn parse_wikipedia_file(
         .collect()
 }
 
-pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
-    let Some(wikidata) = &page.main_entity else { return None;};
-    let wikidata_id = &wikidata.identifier;
-    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
-        Ok(qid) => qid,
-        Err(e) => {
-            warn!(
-                "Could not parse QID for {:?}: {:?}: {:#}",
-                page.name, wikidata_id, e
-            );
-            return None;
-        }
-    };
-
-    ids.get(&wikidata_id).map(|_| wikidata_id)
-}
-
-pub fn is_wikipedia_match(
-    titles: &HashSet<WikipediaTitleNorm>,
-    page: &Page,
-) -> Option<WikipediaTitleNorm> {
-    match WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier) {
-        Err(e) => warn!("Could not parse title for {:?}: {:#}", page.name, e),
-        Ok(title) => {
-            if titles.get(&title).is_some() {
-                return Some(title);
-            }
-        }
-    }
-
-    for redirect in &page.redirects {
-        match WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier) {
-            Err(e) => warn!(
-                "Could not parse redirect title for {:?}: {:?}: {:#}",
-                page.name, redirect.name, e
-            ),
-            Ok(title) => {
-                if titles.get(&title).is_some() {
-                    return Some(title);
-                }
-            }
-        }
-    }
-
-    None
-}
-
 /// Wikidata QID/Q Number
 ///
 /// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
@@ -118,6 +74,23 @@ impl FromStr for WikidataQid {
     }
 }
 
+impl Display for WikidataQid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl WikidataQid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+        path
+    }
+}
+
 /// Normalized wikipedia article title that can compare:
 /// - titles `Spatial Database`
 /// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
@@ -132,6 +105,11 @@ impl FromStr for WikidataQid {
 ///
 /// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
 /// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
 /// ```
 #[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct WikipediaTitleNorm {
@@ -145,7 +123,7 @@ impl WikipediaTitleNorm {
         title.trim().replace(' ', "_")
     }
 
-    // https://en.wikipedia.org/wiki/Article_Title
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
     pub fn from_url(url: &str) -> anyhow::Result<Self> {
         let url = Url::parse(url.trim())?;
@@ -159,21 +137,17 @@ impl WikipediaTitleNorm {
         }
         let lang = subdomain;
 
-        let mut paths = url
-            .path_segments()
-            .ok_or_else(|| anyhow!("Expected path"))?;
+        let path = url.path();
 
-        let root = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected first segment in path"))?;
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
 
         if root != "wiki" {
-            bail!("Expected 'wiki' in path")
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
         }
 
-        let title = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected second segment in path"))?;
         let title = urlencoding::decode(title)?;
 
         Self::from_title(&title, lang)
@@ -202,4 +176,14 @@ impl WikipediaTitleNorm {
         let lang = lang.to_owned();
 
         Ok(Self { name, lang })
     }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+        path
+    }
 }
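
Together, the two `get_dir` helpers produce the layout shown in the commit message. A minimal sketch of their behavior (values taken from the example tree above; assumes the crate's public `wm` module is in scope and that normalization leaves this title unchanged):

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::{WikidataQid, WikipediaTitleNorm};

fn main() {
    // QID directories are grouped under "wikidata/".
    let qid = WikidataQid::from_str("Q59320").unwrap();
    assert_eq!(
        qid.get_dir(PathBuf::from("out")),
        PathBuf::from("out/wikidata/Q59320")
    );

    // Title directories are grouped under "<lang>.wikipedia.org/wiki/".
    let title = WikipediaTitleNorm::from_url(
        "https://en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge",
    )
    .unwrap();
    assert_eq!(
        title.get_dir(PathBuf::from("out")),
        PathBuf::from("out/en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge")
    );
}
```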

--- a/src/wm/page.rs
+++ b/src/wm/page.rs

@@ -1,5 +1,9 @@
+use std::{iter, str::FromStr};
+
 use serde::Deserialize;
 
+use super::{WikidataQid, WikipediaTitleNorm};
+
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
@@ -20,6 +24,31 @@ pub struct Page {
     pub redirects: Vec<Redirect>,
 }
 
+impl Page {
+    pub fn wikidata(&self) -> Option<WikidataQid> {
+        // TODO: return error
+        self.main_entity
+            .as_ref()
+            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+    }
+
+    /// Title of the article
+    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
+        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    }
+
+    /// All titles that lead to the article, the main title followed by any redirects.
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        iter::once(self.title()).chain(self.redirects())
+    }
+
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        self.redirects
+            .iter()
+            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+    }
+}
+
 #[derive(Deserialize)]
 pub struct Wikidata {
     pub identifier: String,
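
As a usage sketch, the new `Page` helpers make it easy to enumerate every directory that should resolve to an article (hypothetical snippet; assumes `page` is a `Page` deserialized from one dump line):

```rust
use std::path::PathBuf;

// `page` is assumed to be a deserialized `Page`; "descriptions" is the
// output directory from the README example.
let base = PathBuf::from("descriptions");
for title in page.all_titles() {
    match title {
        // The main title comes first, followed by any redirects.
        Ok(t) => println!("{}", t.get_dir(base.clone()).display()),
        Err(e) => eprintln!("could not parse title for {:?}: {:#}", page.name, e),
    }
}
```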