From b6db70f74c6b4b37b6cba957ee630fbb1b0d9210 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Mon, 7 Aug 2023 17:40:32 -0400
Subject: [PATCH] Refactor into subcommands

- Use CLI subcommands (e.g. `om-wikiparser get-articles`)
- Move article processing into a separate module
- Convert simplify helper from separate binary to subcommand

Signed-off-by: Evan Lloyd New-Schmidt
---
 src/bin/simplify_html.rs |  23 ---
 src/get_articles.rs      | 263 ++++++++++++++++++++++++++++++++++
 src/main.rs              | 302 ++++++---------------------------------
 3 files changed, 308 insertions(+), 280 deletions(-)
 delete mode 100644 src/bin/simplify_html.rs
 create mode 100644 src/get_articles.rs

diff --git a/src/bin/simplify_html.rs b/src/bin/simplify_html.rs
deleted file mode 100644
index 6e66e9e..0000000
--- a/src/bin/simplify_html.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-//! Apply html article simplification to stdin, and write it to stdout.
-//!
-//! Usage:
-//! simplify_html < article.html > simplified.html
-use std::io::{stdin, stdout, Read, Write};
-
-use om_wikiparser::html::simplify;
-
-fn main() -> anyhow::Result<()> {
-    env_logger::Builder::new()
-        .filter_level(log::LevelFilter::Info)
-        .parse_default_env()
-        .try_init()?;
-
-    let mut input = String::new();
-    stdin().read_to_string(&mut input)?;
-
-    let output = simplify(&input, "en");
-
-    stdout().write_all(output.as_bytes())?;
-
-    Ok(())
-}
diff --git a/src/get_articles.rs b/src/get_articles.rs
new file mode 100644
index 0000000..c4a342f
--- /dev/null
+++ b/src/get_articles.rs
@@ -0,0 +1,263 @@
+use std::{
+    fs::{self, File},
+    io::{stdin, BufRead, Write},
+    os::unix,
+    path::{Path, PathBuf},
+};
+
+use anyhow::{anyhow, bail, Context};
+
+use om_wikiparser::{
+    html::simplify,
+    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
+};
+
+/// Extract article HTML from Wikipedia Enterprise HTML dumps.
+///
+/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
+#[derive(clap::Args)]
+pub struct Args {
+    /// Directory to write the extracted articles to.
+    pub output_dir: PathBuf,
+
+    /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
+    ///
+    /// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
+    #[arg(long, help_heading = "FILTERS")]
+    pub osm_tags: Option<PathBuf>,
+
+    /// Path to file that contains a Wikidata QID to extract on each line
+    /// (e.g. `Q12345`).
+    #[arg(long, help_heading = "FILTERS")]
+    pub wikidata_ids: Option<PathBuf>,
+
+    /// Path to file that contains a Wikipedia article url to extract on each line
+    /// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
+    #[arg(long, help_heading = "FILTERS")]
+    pub wikipedia_urls: Option<PathBuf>,
+
+    /// Append to the provided file path the QIDs of articles matched by title but not QID.
+    ///
+    /// Use this to save the QIDs of articles you know the url of, but not the QID.
+    /// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
+    /// Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
+    #[arg(long, requires("wikipedia_urls"))]
+    pub write_new_ids: Option<PathBuf>,
+}
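As an aside for readers unfamiliar with the filter files described above: `--wikidata-ids` is assumed to take a plain text file with one QID per line. The sketch below is only an illustration of that format — it is not the crate's actual `parse_wikidata_file`, and the function name is hypothetical.

    use std::collections::HashSet;
    use std::fs;

    // Illustration of the assumed `--wikidata-ids` input: one QID (e.g. "Q12345") per line.
    fn read_qid_file(path: &str) -> std::io::Result<HashSet<String>> {
        Ok(fs::read_to_string(path)?
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .map(|line| line.to_string())
            .collect())
    }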
+
+pub fn run(args: Args) -> anyhow::Result<()> {
+    let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+        info!("Loading article urls from {path:?}");
+        parse_wikipedia_file(path)?
+    } else {
+        Default::default()
+    };
+
+    let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
+        info!("Loading wikidata ids from {path:?}");
+        parse_wikidata_file(path)?
+    } else {
+        Default::default()
+    };
+
+    if let Some(path) = args.osm_tags {
+        info!("Loading wikipedia/wikidata osm tags from {path:?}");
+        parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
+    }
+
+    debug!("Parsed {} unique article urls", wikipedia_titles.len());
+    debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
+
+    // NOTE: For atomic writes to the same file across threads/processes:
+    // - The file needs to be opened in APPEND mode (`.append(true)`).
+    // - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
+    // - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
+    //
+    // For more information, see:
+    // - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
+    // - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
+    // - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
+    let mut write_new_ids = args
+        .write_new_ids
+        .as_ref()
+        .map(|p| File::options().create(true).append(true).open(p))
+        .transpose()?;
+
+    if !args.output_dir.is_dir() {
+        bail!("output dir {:?} does not exist", args.output_dir)
+    }
+
+    info!("Processing dump");
+    let dump = stdin().lock();
+
+    // TODO: Compare different deserialization methods.
+    // The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
+    // let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
+    let stream = dump.lines().map(|r| {
+        r.map_err(anyhow::Error::new)
+            .and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
+    });
+
+    for page in stream {
+        let page = page?;
+
+        let qid = page.wikidata();
+
+        let is_wikidata_match = qid
+            .as_ref()
+            .map(|qid| wikidata_ids.contains(qid))
+            .unwrap_or_default();
+
+        let matching_titles = if wikipedia_titles.is_empty() {
+            Default::default()
+        } else {
+            page.all_titles()
+                .filter_map(|r| {
+                    r.map(Some).unwrap_or_else(|e| {
+                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                        None
+                    })
+                })
+                .filter(|t| wikipedia_titles.contains(t))
+                .collect::<Vec<_>>()
+        };
+
+        if !is_wikidata_match && matching_titles.is_empty() {
+            continue;
+        }
+
+        // Write matched new QIDs back to file.
+        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
+            if !is_wikidata_match && !matching_titles.is_empty() {
+                debug!("Writing new id {} for article {:?}", qid, page.name);
+                // NOTE: Write to string buffer first to have a single atomic write syscall.
+                // See `write_new_ids` for more info.
+                let line = format!("{}\n", qid);
+                write!(f, "{}", line).with_context(|| {
+                    format!(
+                        "writing new id to file {:?}",
+                        args.write_new_ids.as_ref().unwrap()
+                    )
+                })?;
+            }
+        }
+
+        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
+            error!("Error writing article {:?}: {:#}", page.name, e);
+        }
+    }
+
+    Ok(())
+}
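For context, the append-mode handling in `run` follows the pattern spelled out in the NOTE above: open with `.append(true)`, format the whole line first, then issue exactly one write. A minimal, self-contained sketch of that pattern follows; the function name and error handling are illustrative assumptions, not part of the patch.

    use std::fs::File;
    use std::io::Write;
    use std::path::Path;

    // Append one QID per call. Formatting the line up front keeps the write to a
    // single syscall, so concurrent writers do not interleave as long as the line
    // stays well under PIPE_BUF (typically 4 KiB on Linux).
    fn append_new_id(path: &Path, qid: &str) -> std::io::Result<()> {
        let mut file = File::options().create(true).append(true).open(path)?;
        let line = format!("{qid}\n");
        file.write_all(line.as_bytes())
    }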
+
+/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
+fn create_article_dir(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<PathBuf> {
+    let base = base.as_ref();
+    let mut redirects = redirects.into_iter();
+
+    let main_dir = match page.wikidata() {
+        None => {
+            // Write to wikipedia title directory.
+            // Prefer first redirect, fall back to page title if none exist
+            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
+            redirects
+                .next()
+                .or_else(|| match page.title() {
+                    Ok(title) => Some(title),
+                    Err(e) => {
+                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
+                        None
+                    }
+                })
+                // hard fail when no titles can be parsed
+                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
+                .get_dir(base.to_owned())
+        }
+        Some(qid) => {
+            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
+            qid.get_dir(base.to_owned())
+        }
+    };
+
+    if main_dir.is_symlink() {
+        fs::remove_file(&main_dir)
+            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
+    }
+    fs::create_dir_all(&main_dir)
+        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
+
+    // Write symlinks to main directory.
+    for title in redirects {
+        let wikipedia_dir = title.get_dir(base.to_owned());
+
+        // Build required directory.
+        //
+        // Possible states from previous run:
+        // - Does not exist (and is not a symlink)
+        // - Exists, is a directory
+        // - Exists, is a valid symlink to correct location
+        // - Exists, is a valid symlink to incorrect location
+        if wikipedia_dir.exists() {
+            if wikipedia_dir.is_symlink() {
+                // Only replace if not valid
+                if fs::read_link(&wikipedia_dir)? == main_dir {
+                    continue;
+                }
+                fs::remove_file(&wikipedia_dir)?;
+            } else {
+                fs::remove_dir_all(&wikipedia_dir)?;
+            }
+        } else {
+            // titles can contain `/`, so ensure necessary subdirs exist
+            let parent_dir = wikipedia_dir.parent().unwrap();
+            fs::create_dir_all(parent_dir)
+                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
+        }
+
+        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
+            format!(
+                "creating symlink from {:?} to {:?}",
+                wikipedia_dir, main_dir
+            )
+        })?;
+    }
+
+    Ok(main_dir)
+}
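The symlink reconciliation above boils down to a small standalone routine. The sketch below restates the states listed in the comment using hypothetical `link`/`target` paths instead of the wikiparser types; it is an illustration, not the project's code.

    use std::fs;
    use std::io;
    use std::os::unix;
    use std::path::Path;

    // Make `link` a symlink pointing at `target`, tolerating leftovers from a
    // previous run (a plain directory, or a symlink to the wrong location).
    fn ensure_symlink(link: &Path, target: &Path) -> io::Result<()> {
        if link.exists() {
            if link.is_symlink() {
                if fs::read_link(link)?.as_path() == target {
                    return Ok(()); // already valid, nothing to do
                }
                fs::remove_file(link)?; // points at the wrong place
            } else {
                fs::remove_dir_all(link)?; // stale directory
            }
        } else if let Some(parent) = link.parent() {
            // titles can contain `/`, so the parent directory may not exist yet
            fs::create_dir_all(parent)?;
        }
        unix::fs::symlink(target, link)
    }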
+
+/// Write selected article to disk.
+///
+/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
+/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
+/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
+fn write(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let article_dir = create_article_dir(base, page, redirects)?;
+
+    // Write html to determined file.
+    let mut filename = article_dir;
+    filename.push(&page.in_language.identifier);
+    filename.set_extension("html");
+
+    debug!("{:?}: {:?}", page.name, filename);
+
+    if filename.exists() {
+        debug!("Overwriting existing file");
+    }
+
+    let html = simplify(&page.article_body.html, &page.in_language.identifier);
+
+    let mut file =
+        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
+    file.write_all(html.as_bytes())
+        .with_context(|| format!("writing html file {:?}", filename))?;
+
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index 1dc289b..4369eea 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,19 +1,8 @@
-use std::{
-    fs::{self, File},
-    io::{stdin, BufRead, Write},
-    os::unix,
-    path::{Path, PathBuf},
-};
-
-use anyhow::{anyhow, bail, Context};
-use clap::{CommandFactory, Parser};
+use clap::{CommandFactory, Parser, Subcommand};
 #[macro_use]
 extern crate log;
 
-use om_wikiparser::{
-    html::simplify,
-    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
-};
+mod get_articles;
 
 /// Get the version returned by `git describe`, e.g.:
 /// - `v2.0` if a git tag
@@ -28,150 +17,30 @@ fn version() -> &'static str {
         .unwrap_or("unknown")
 }
 
-/// Extract article HTML from Wikipedia Enterprise HTML dumps.
-///
-/// Expects an uncompressed dump connected to stdin.
 #[derive(Parser)]
 #[command(version = crate::version())]
 struct Args {
-    /// Directory to write the extracted articles to.
-    output_dir: PathBuf,
-
-    /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
-    ///
-    /// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
-    #[arg(long, help_heading = "FILTERS")]
-    osm_tags: Option<PathBuf>,
-
-    /// Path to file that contains a Wikidata QID to extract on each line
-    /// (e.g. `Q12345`).
-    #[arg(long, help_heading = "FILTERS")]
-    wikidata_ids: Option<PathBuf>,
-
-    /// Path to file that contains a Wikipedia article url to extract on each line
-    /// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
-    #[arg(long, help_heading = "FILTERS")]
-    wikipedia_urls: Option<PathBuf>,
-
-    /// Append to the provided file path the QIDs of articles matched by title but not QID.
-    ///
-    /// Use this to save the QIDs of articles you know the url of, but not the QID.
-    /// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
-    /// Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
-    #[arg(long, requires("wikipedia_urls"))]
-    write_new_ids: Option<PathBuf>,
+    #[command(subcommand)]
+    cmd: Cmd,
 }
 
-/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
-fn create_article_dir(
-    base: impl AsRef<Path>,
-    page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
-) -> anyhow::Result<PathBuf> {
-    let base = base.as_ref();
-    let mut redirects = redirects.into_iter();
+#[derive(Subcommand)]
+enum Cmd {
+    GetArticles(get_articles::Args),
 
-    let main_dir = match page.wikidata() {
-        None => {
-            // Write to wikipedia title directory.
-            // Prefer first redirect, fall back to page title if none exist
-            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
-            redirects
-                .next()
-                .or_else(|| match page.title() {
-                    Ok(title) => Some(title),
-                    Err(e) => {
-                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
-                        None
-                    }
-                })
-                // hard fail when no titles can be parsed
-                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
-                .get_dir(base.to_owned())
-        }
-        Some(qid) => {
-            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
-            qid.get_dir(base.to_owned())
-        }
-    };
+    /// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
+    ///
+    /// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
+    GetTags,
 
-    if main_dir.is_symlink() {
-        fs::remove_file(&main_dir)
-            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
-    }
-    fs::create_dir_all(&main_dir)
-        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
-
-    // Write symlinks to main directory.
-    for title in redirects {
-        let wikipedia_dir = title.get_dir(base.to_owned());
-
-        // Build required directory.
-        //
-        // Possible states from previous run:
-        // - Does not exist (and is not a symlink)
-        // - Exists, is a directory
-        // - Exists, is a valid symlink to correct location
-        // - Exists, is a valid symlink to incorrect location
-        if wikipedia_dir.exists() {
-            if wikipedia_dir.is_symlink() {
-                // Only replace if not valid
-                if fs::read_link(&wikipedia_dir)? == main_dir {
-                    continue;
-                }
-                fs::remove_file(&wikipedia_dir)?;
-            } else {
-                fs::remove_dir_all(&wikipedia_dir)?;
-            }
-        } else {
-            // titles can contain `/`, so ensure necessary subdirs exist
-            let parent_dir = wikipedia_dir.parent().unwrap();
-            fs::create_dir_all(parent_dir)
-                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
-        }
-
-        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
-            format!(
-                "creating symlink from {:?} to {:?}",
-                wikipedia_dir, main_dir
-            )
-        })?;
-    }
-
-    Ok(main_dir)
-}
-
-/// Write selected article to disk.
-///
-/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
-/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
-/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
-fn write(
-    base: impl AsRef<Path>,
-    page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
-) -> anyhow::Result<()> {
-    let article_dir = create_article_dir(base, page, redirects)?;
-
-    // Write html to determined file.
-    let mut filename = article_dir;
-    filename.push(&page.in_language.identifier);
-    filename.set_extension("html");
-
-    debug!("{:?}: {:?}", page.name, filename);
-
-    if filename.exists() {
-        debug!("Overwriting existing file");
-    }
-
-    let html = simplify(&page.article_body.html, &page.in_language.identifier);
-
-    let mut file =
-        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
-    file.write_all(html.as_bytes())
-        .with_context(|| format!("writing html file {:?}", filename))?;
-
-    Ok(())
+    /// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
+    ///
+    /// This is meant for testing and debugging.
+    Simplify {
+        /// The language to use when processing the article (defaults to `en`).
+        #[arg(long, default_value_t = String::from("en"))]
+        lang: String,
+    },
 }
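For readers new to clap's derive API, the subcommand wiring above can be exercised without running the binary: `try_parse_from` builds the same `Args` from an argv-style slice. The test below is a sketch to show that mapping (and the `--lang` default); it is not part of the patch, and the test name and assertions are assumptions.

    #[cfg(test)]
    mod cli_tests {
        use super::*;
        use clap::Parser;

        // `Simplify` derives the subcommand name `simplify`; with no `--lang`
        // given, `default_value_t` fills in "en".
        #[test]
        fn simplify_defaults_to_english() {
            let args = Args::try_parse_from(["om-wikiparser", "simplify"]).unwrap();
            match args.cmd {
                Cmd::Simplify { lang } => assert_eq!(lang, "en"),
                _ => panic!("expected the simplify subcommand"),
            }
        }
    }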
 
 fn main() -> anyhow::Result<()> {
@@ -184,117 +53,36 @@
 
     let args = Args::parse();
 
-    if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() && args.osm_tags.is_none() {
-        let mut cmd = Args::command();
-        cmd.error(
-            clap::error::ErrorKind::MissingRequiredArgument,
-            "at least one --osm-tags --wikidata-ids --wikipedia-urls is required",
-        )
-        .exit()
-    }
-
     info!("{} {}", Args::command().get_name(), version());
 
-    let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
-        info!("Loading article urls from {path:?}");
-        parse_wikipedia_file(path)?
-    } else {
-        Default::default()
-    };
-
-    let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
-        info!("Loading wikidata ids from {path:?}");
-        parse_wikidata_file(path)?
-    } else {
-        Default::default()
-    };
-
-    if let Some(path) = args.osm_tags {
-        info!("Loading wikipedia/wikidata osm tags from {path:?}");
-        parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
-    }
-
-    debug!("Parsed {} unique article urls", wikipedia_titles.len());
-    debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
-
-    // NOTE: For atomic writes to the same file across threads/processes:
-    // - The file needs to be opened in APPEND mode (`.append(true)`).
-    // - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).
-    // - Each write needs to be under `PIPE_BUF` size (see `man write(3)`), usually 4kb on Linux.
-    //
-    // For more information, see:
-    // - `man write(3posix)`: https://www.man7.org/linux/man-pages/man3/write.3p.html
-    // - `std::fs::OpenOptions::append`: https://doc.rust-lang.org/std/fs/struct.OpenOptions.html#method.append
-    // - https://stackoverflow.com/questions/1154446/is-file-append-atomic-in-unix
-    let mut write_new_ids = args
-        .write_new_ids
-        .as_ref()
-        .map(|p| File::options().create(true).append(true).open(p))
-        .transpose()?;
-
-    if !args.output_dir.is_dir() {
-        bail!("output dir {:?} does not exist", args.output_dir)
-    }
-
-    info!("Processing dump");
-    let dump = stdin().lock();
-
-    // TODO: Compare different deserialization methods.
-    // The docs warn against using a reader directly, and it's slower than tar can decompress the dump.
-    // let stream = serde_json::Deserializer::from_reader(dump).into_iter::<Page>();
-    let stream = dump.lines().map(|r| {
-        r.map_err(anyhow::Error::new)
-            .and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
-    });
-
-    for page in stream {
-        let page = page?;
-
-        let qid = page.wikidata();
-
-        let is_wikidata_match = qid
-            .as_ref()
-            .map(|qid| wikidata_ids.contains(qid))
-            .unwrap_or_default();
-
-        let matching_titles = if wikipedia_titles.is_empty() {
-            Default::default()
-        } else {
-            page.all_titles()
-                .filter_map(|r| {
-                    r.map(Some).unwrap_or_else(|e| {
-                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
-                        None
-                    })
-                })
-                .filter(|t| wikipedia_titles.contains(t))
-                .collect::<Vec<_>>()
-        };
-
-        if !is_wikidata_match && matching_titles.is_empty() {
-            continue;
-        }
-
-        // Write matched new QIDs back to fild.
-        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
-            if !is_wikidata_match && !matching_titles.is_empty() {
-                debug!("Writing new id {} for article {:?}", qid, page.name);
-                // NOTE: Write to string buffer first to have a single atomic write syscall.
-                // See `write_new_ids` for more info.
-                let line = format!("{}\n", qid);
-                write!(f, "{}", line).with_context(|| {
-                    format!(
-                        "writing new id to file {:?}",
-                        args.write_new_ids.as_ref().unwrap()
-                    )
-                })?;
+    match args.cmd {
+        Cmd::GetArticles(args) => {
+            if args.wikidata_ids.is_none()
+                && args.wikipedia_urls.is_none()
+                && args.osm_tags.is_none()
+            {
+                let mut cmd = Args::command();
+                cmd.error(
+                    clap::error::ErrorKind::MissingRequiredArgument,
+                    "at least one of --osm-tags --wikidata-ids --wikipedia-urls is required",
+                )
+                .exit()
             }
-        }
 
-        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
-            error!("Error writing article {:?}: {:#}", page.name, e);
+            get_articles::run(args)
+        }
+        Cmd::GetTags => todo!(),
+        Cmd::Simplify { lang } => {
+            use std::io::{stdin, stdout, Read, Write};
+
+            let mut input = String::new();
+            stdin().read_to_string(&mut input)?;
+
+            let output = om_wikiparser::html::simplify(&input, &lang);
+
+            stdout().write_all(output.as_bytes())?;
+
+            Ok(())
+        }
-        }
     }
-
-    Ok(())
 }