Add option to dump new QIDs to file

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt 2023-07-11 12:02:19 -04:00
parent 45efd77c0d
commit 7d287bd5a4
2 changed files with 90 additions and 24 deletions

--- a/README.md
+++ b/README.md

@@ -19,6 +19,35 @@ Alternatively, build it with `cargo build --release`, which places the binary in
 
+Run the program with the `--help` flag to see all supported arguments.
+
+```shell
+$ cargo run --release -- --help
+Extract article HTML from Wikipedia Enterprise HTML dumps.
+
+Expects an uncompressed dump connected to stdin.
+
+Usage: om-wikiparser [OPTIONS] <OUTPUT_DIR>
+
+Arguments:
+  <OUTPUT_DIR>
+          Directory to write the extracted articles to
+
+Options:
+      --wikidata-ids <WIKIDATA_IDS>
+          File of Wikidata QIDs to extract, one per line (e.g. `Q12345`)
+
+      --wikipedia-urls <WIKIPEDIA_URLS>
+          File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`)
+
+      --write-new-ids <WRITE_NEW_IDS>
+          Append QIDs of articles matched by title but not QID to the provided file
+
+  -h, --help
+          Print help (see a summary with '-h')
+
+  -V, --version
+          Print version
+```
+
 It takes as inputs:
 - A Wikipedia Enterprise JSON dump, extracted and connected to `stdin`.
 - A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
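Putting those inputs together, a hypothetical invocation using the new flag might look like the sketch below (the file and directory names are placeholders, and the `tar` step assumes a gzipped tar dump; any command that streams the decompressed dump to `stdin` works):

```shell
# Stream the decompressed dump to stdin, extract articles matching either
# list, and append QIDs of articles that matched only by title to
# `new_qids.txt`. Note that `--write-new-ids` requires `--wikipedia-urls`.
$ tar xzOf enterprise-html-dump.json.tar.gz | om-wikiparser \
    --wikidata-ids wikidata_ids.txt \
    --wikipedia-urls wikipedia_urls.txt \
    --write-new-ids new_qids.txt \
    descriptions/
```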

--- a/src/main.rs
+++ b/src/main.rs

@@ -15,13 +15,23 @@ use om_wikiparser::{
     wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
 
+/// Extract article HTML from Wikipedia Enterprise HTML dumps.
+///
+/// Expects an uncompressed dump connected to stdin.
 #[derive(Parser)]
+#[command(version)]
 struct Args {
+    /// Directory to write the extracted articles to.
     output_dir: PathBuf,
+    /// File of Wikidata QIDs to extract, one per line (e.g. `Q12345`).
     #[arg(long)]
     wikidata_ids: Option<PathBuf>,
+    /// File of Wikipedia article titles to extract, one per line (e.g. `https://lang.wikipedia.org/wiki/Article_Title`).
     #[arg(long)]
     wikipedia_urls: Option<PathBuf>,
+    /// Append QIDs of articles matched by title but not QID to the provided file.
+    #[arg(long, requires("wikipedia_urls"))]
+    write_new_ids: Option<PathBuf>,
 }
 
 /// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
@@ -147,19 +157,29 @@ fn main() -> anyhow::Result<()> {
     let args = Args::parse();
 
-    info!("Loading urls");
-    let wikipedia_titles = args
-        .wikipedia_urls
-        .map(parse_wikipedia_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+        info!("Loading article urls from {path:?}");
+        let urls = parse_wikipedia_file(path)?;
+        debug!("Parsed {} unique article urls", urls.len());
+        urls
+    } else {
+        Default::default()
+    };
 
-    info!("Loading ids");
-    let wikidata_ids = args
-        .wikidata_ids
-        .map(parse_wikidata_file)
-        .transpose()?
-        .unwrap_or_default();
+    let wikidata_ids = if let Some(path) = args.wikidata_ids {
+        info!("Loading wikidata ids from {path:?}");
+        let ids = parse_wikidata_file(path)?;
+        debug!("Parsed {} unique wikidata ids", ids.len());
+        ids
+    } else {
+        Default::default()
+    };
+
+    let mut write_new_ids = args
+        .write_new_ids
+        .as_ref()
+        .map(|p| File::options().create(true).append(true).open(p))
+        .transpose()?;
 
     if !args.output_dir.is_dir() {
         bail!("output dir {:?} does not exist", args.output_dir)
@@ -179,26 +199,43 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        let is_wikidata_match = page
-            .wikidata()
-            .map(|qid| wikidata_ids.contains(&qid))
+        let qid = page.wikidata();
+
+        let is_wikidata_match = qid
+            .as_ref()
+            .map(|qid| wikidata_ids.contains(qid))
             .unwrap_or_default();
 
-        let matching_titles = page
-            .all_titles()
-            .filter_map(|r| {
-                r.map(Some).unwrap_or_else(|e| {
-                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
-                    None
-                })
-            })
-            .filter(|t| wikipedia_titles.contains(t))
-            .collect::<Vec<_>>();
+        let matching_titles = if wikipedia_titles.is_empty() {
+            Default::default()
+        } else {
+            page.all_titles()
+                .filter_map(|r| {
+                    r.map(Some).unwrap_or_else(|e| {
+                        warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                        None
+                    })
+                })
+                .filter(|t| wikipedia_titles.contains(t))
+                .collect::<Vec<_>>()
+        };
 
         if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
+        if let (Some(f), Some(qid)) = (&mut write_new_ids, &qid) {
+            if !is_wikidata_match && !matching_titles.is_empty() {
+                debug!("Writing new id {} for article {:?}", qid, page.name);
+                writeln!(f, "{}", qid).with_context(|| {
+                    format!(
+                        "writing new id to file {:?}",
+                        args.write_new_ids.as_ref().unwrap()
+                    )
+                })?;
+            }
+        }
+
         if let Err(e) = write(&args.output_dir, &page, matching_titles) {
             error!("Error writing article {:?}: {:#}", page.name, e);
         }