Add option to dump new QIDs (#20)

This allows us to extract articles that we know the title of but not the QID of from other languages' dumps in another pass.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt 2023-07-13 14:04:52 -04:00 committed by GitHub
parent 45efd77c0d
commit 75f4f6a21b
3 changed files with 133 additions and 34 deletions


@@ -19,6 +19,38 @@ Alternatively, build it with `cargo build --release`, which places the binary in
Run the program with the `--help` flag to see all supported arguments.
```shell
$ cargo run --release -- --help
Extract article HTML from Wikipedia Enterprise HTML dumps.
Expects an uncompressed dump connected to stdin.
Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
Arguments:
<OUTPUT_DIR>
Directory to write the extracted articles to
Options:
--write-new-ids <WRITE_NEW_IDS>
Append to the provided file path the QIDs of articles matched by title but not QID.
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
FILTERS:
--wikidata-ids <WIKIDATA_IDS>
Path to file that contains a Wikidata QID to extract on each line (e.g. `Q12345`)
--wikipedia-urls <WIKIPEDIA_URLS>
Path to file that contains a Wikipedia article url to extract on each line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
```
It takes as inputs:
- A Wikipedia Enterprise JSON dump, decompressed and connected to `stdin`.
- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
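The `--write-new-ids` option enables a two-pass workflow. A minimal sketch, assuming hypothetical file names (`enwiki.json` and `dewiki.json` for decompressed dumps, `urls.txt` for the URL list, `new_qids.txt` for the recorded QIDs, and `descriptions/` as the output directory):
```shell
# Pass 1: extract articles matched by URL from the English dump, and record
# the QIDs of articles that matched by title but not by QID.
$ mkdir -p descriptions
$ cat enwiki.json | cargo run --release -- \
    --wikipedia-urls urls.txt \
    --write-new-ids new_qids.txt \
    descriptions

# Pass 2: extract those same articles from another language's dump by QID.
$ cat dewiki.json | cargo run --release -- \
    --wikidata-ids new_qids.txt \
    descriptions
```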


@@ -6,7 +6,7 @@ use std::{
};
use anyhow::{anyhow, bail, Context};
use clap::Parser;
use clap::{CommandFactory, Parser};
#[macro_use]
extern crate log;
@@ -15,13 +15,31 @@ use om_wikiparser::{
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};
/// Extract article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump connected to stdin.
#[derive(Parser)]
#[command(version)]
struct Args {
/// Directory to write the extracted articles to.
output_dir: PathBuf,
#[arg(long)]
/// Path to file that contains a Wikidata QID to extract on each line
/// (e.g. `Q12345`).
#[arg(long, help_heading = "FILTERS")]
wikidata_ids: Option<PathBuf>,
#[arg(long)]
/// Path to file that contains a Wikipedia article url to extract on each line
/// (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
#[arg(long, help_heading = "FILTERS")]
wikipedia_urls: Option<PathBuf>,
/// Append to the provided file path the QIDs of articles matched by title but not QID.
///
/// Use this to save the QIDs of articles you know the url of, but not the QID.
/// The same path can later be passed to the `--wikidata-ids` option to extract them from another language's dump.
#[arg(long, requires("wikipedia_urls"))]
write_new_ids: Option<PathBuf>,
}
/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
@@ -65,7 +83,6 @@ fn create_article_dir(
.with_context(|| format!("creating main directory {:?}", &main_dir))?;
// Write symlinks to main directory.
// TODO: Only write redirects that we care about.
for title in redirects {
let wikipedia_dir = title.get_dir(base.to_owned());
@@ -147,19 +164,38 @@ fn main() -> anyhow::Result<()> {
let args = Args::parse();
info!("Loading urls");
let wikipedia_titles = args
.wikipedia_urls
.map(parse_wikipedia_file)
.transpose()?
.unwrap_or_default();
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
let mut cmd = Args::command();
cmd.error(
clap::error::ErrorKind::MissingRequiredArgument,
"one or both of --wikidata-ids and --wikipedia-urls is required",
)
.exit()
}
info!("Loading ids");
let wikidata_ids = args
.wikidata_ids
.map(parse_wikidata_file)
.transpose()?
.unwrap_or_default();
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
let urls = parse_wikipedia_file(path)?;
debug!("Parsed {} unique article urls", urls.len());
urls
} else {
Default::default()
};
let wikidata_ids = if let Some(path) = args.wikidata_ids {
info!("Loading wikidata ids from {path:?}");
let ids = parse_wikidata_file(path)?;
debug!("Parsed {} unique wikidata ids", ids.len());
ids
} else {
Default::default()
};
let mut write_new_ids = args
.write_new_ids
.as_ref()
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
@@ -179,26 +215,43 @@ fn main() -> anyhow::Result<()> {
for page in stream {
let page = page?;
let is_wikidata_match = page
.wikidata()
.map(|qid| wikidata_ids.contains(&qid))
let qid = page.wikidata();
let is_wikidata_match = qid
.as_ref()
.map(|qid| wikidata_ids.contains(qid))
.unwrap_or_default();
let matching_titles = page
.all_titles()
.filter_map(|r| {
r.map(Some).unwrap_or_else(|e| {
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
None
let matching_titles = if wikipedia_titles.is_empty() {
Default::default()
} else {
page.all_titles()
.filter_map(|r| {
r.map(Some).unwrap_or_else(|e| {
warn!("Could not parse title for {:?}: {:#}", &page.name, e);
None
})
})
})
.filter(|t| wikipedia_titles.contains(t))
.collect::<Vec<_>>();
.filter(|t| wikipedia_titles.contains(t))
.collect::<Vec<_>>()
};
if !is_wikidata_match && matching_titles.is_empty() {
continue;
}
if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
if !is_wikidata_match && !matching_titles.is_empty() {
debug!("Writing new id {} for article {:?}", qid, page.name);
writeln!(f, "{}", qid).with_context(|| {
format!(
"writing new id to file {:?}",
args.write_new_ids.as_ref().unwrap()
)
})?;
}
}
if let Err(e) = write(&args.output_dir, &page, matching_titles) {
error!("Error writing article {:?}: {:#}", page.name, e);
}


@@ -14,16 +14,23 @@ pub use page::Page;
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
let contents = fs::read_to_string(path.as_ref())?;
contents
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("bad QID value on line {line_num}: {line:?}")
format!("on line {line_num}: {line:?}")
})
})
.collect()
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
}
})
.collect())
}
/// Read article titles from a file of urls on each line.
@@ -31,16 +38,23 @@ pub fn parse_wikipedia_file(
path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
let contents = fs::read_to_string(path.as_ref())?;
contents
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikipediaTitleNorm::from_url(line).with_context(|| {
let line_num = i + 1;
format!("bad wikipedia url on line {line_num}: {line:?}")
format!("on line {line_num}: {line:?}")
})
})
.collect()
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
}
})
.collect())
}
/// Wikidata QID/Q Number
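For reference, a rough sketch of the filter files these parsers expect, one entry per line (hypothetical values); with this change, lines that fail to parse are logged as warnings and skipped instead of aborting the run:
```shell
$ cat wikidata_ids.txt
Q12345
Q67890
$ cat wikipedia_urls.txt
https://en.wikipedia.org/wiki/Article_Title
https://de.wikipedia.org/wiki/Artikel_Titel
```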