Debugging Improvements #39

Merged: newsch merged 3 commits from debugging into main 2024-04-28 18:20:52 +00:00
7 changed files with 245 additions and 115 deletions

Cargo.lock (generated)

@@ -301,6 +301,15 @@ dependencies = [
"memchr",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@@ -679,6 +688,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num_cpus"
version = "1.16.0"
@@ -718,6 +733,7 @@ dependencies = [
"serde_json",
"thiserror",
"tracing",
"tracing-logfmt",
"tracing-subscriber",
"unicode-normalization",
"url",
@@ -877,6 +893,12 @@ version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -1214,18 +1236,18 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
[[package]]
name = "serde"
version = "1.0.163"
version = "1.0.193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.163"
version = "1.0.193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
dependencies = [
"proc-macro2",
"quote",
@@ -1388,6 +1410,37 @@ dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.3.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@@ -1447,6 +1500,18 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-logfmt"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22b8e455f6caa5212a102ec530bf86b8dc5a4c536299bffd84b238fed9119be7"
dependencies = [
"time",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.17"

Cargo.toml

@@ -25,6 +25,7 @@ serde_json = "1.0.96"
thiserror = "1.0.44"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
tracing-logfmt = "0.3.4"
url = "2.3.1"
urlencoding = "2.1.2"

src/extend.rs (new file)

@@ -0,0 +1,38 @@
//! Utilities for working with [Extend].
use std::iter::Extend;
/// Calls `f` for each `Item`.
///
/// ```
/// # use om_wikiparser::extend;
/// let mut count = 0;
///
/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
/// [1, 2, 3, 4],
/// ['a', 'b', 'c']));
/// assert_eq!(count, 3);
/// ```
pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
FromFn(f)
}
pub struct FromFn<F>(F);
impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
for item in iter {
self.0(item);
}
}
}
/// Iterates but drops each `Item`.
pub fn sink() -> Sink {
Sink(())
}
pub struct Sink(());
impl<Item> Extend<Item> for Sink {
fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
for _item in iter {}
}
}
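
`sink()` ships without a doctest; a minimal usage sketch of both helpers, assuming nothing beyond this file — the counting pattern is exactly how `get_articles.rs` tallies tag-file errors below:

```rust
use om_wikiparser::extend;

// Count items without storing them.
let mut error_count = 0;
let mut counter = extend::from_fn(|_line: &str| error_count += 1);
counter.extend(["bad line", "another bad line"]);
assert_eq!(error_count, 2);

// Accept and discard items when an `Extend` argument is
// required but the items themselves are not needed.
extend::sink().extend(0..100);
```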

src/get_articles.rs

@@ -1,6 +1,8 @@
use std::{
borrow::Cow,
collections::HashSet,
fs::{self, File},
io::{stdin, BufRead, Write},
io::{stdin, stdout, BufRead, BufReader, Write},
os::unix,
path::{Path, PathBuf},
};
@@ -8,18 +10,34 @@ use std::{
use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
extend,
html::{self, HtmlError},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
};
#[derive(clap::ValueEnum, Copy, Clone)]
pub enum ArticleFilter {
/// All articles that match on title/QID
Match,
/// Articles that cannot be simplified
Error,
/// Articles that cause panics when simplified
Panic, // FIXME: move panic dumping to this
}
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
#[derive(clap::Args)]
pub struct Args {
/// Directory to write the extracted articles to.
pub output_dir: PathBuf,
#[arg(required_unless_present = "passthrough")]
pub output_dir: Option<PathBuf>,
/// Copy input article JSON to stdout if it matches certain criteria.
#[arg(long)]
pub passthrough: Option<ArticleFilter>,
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
@@ -51,34 +69,34 @@ pub struct Args {
}
pub fn run(args: Args) -> anyhow::Result<()> {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = HashSet::new();
if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
parse_wikipedia_file(path)?
} else {
Default::default()
};
let file = BufReader::new(File::open(path)?);
parse_wikipedia_file(file, &mut wikipedia_titles)?
}
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
let mut wikidata_qids = HashSet::new();
if let Some(path) = args.wikidata_qids {
info!("Loading wikidata QIDs from {path:?}");
parse_wikidata_file(path)?
} else {
Default::default()
let file = BufReader::new(File::open(path)?);
parse_wikidata_file(file, &mut wikidata_qids)?
};
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
let file = File::open(path)?;
let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
let mut error_count = 0;
parse_osm_tag_file(
path,
file,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
&mut extend::from_fn(|_| error_count += 1),
)?;
if !line_errors.is_empty() {
let error_count = line_errors.len();
if error_count != 0 {
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let percentage = 100.0 * error_count as f64 / new_items as f64;
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
@@ -103,10 +121,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
if let Some(output_dir) = &args.output_dir {
if !output_dir.is_dir() {
bail!("output dir {:?} does not exist", output_dir);
}
}
let mut stdout = stdout();
info!("Processing dump");
let mut dump = stdin().lock();
@@ -179,8 +201,36 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
}
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
error!("Error writing article: {:#}", e);
// Always write regardless of later errors.
if let Some(ArticleFilter::Match) = args.passthrough {
stdout.write_all(buffer.as_bytes())?;
}
let article_output = if args.no_simplify {
Ok(Cow::Borrowed(&page.article_body.html))
} else {
html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
};
match article_output {
Err(e) => {
error!("Error processing article: {:#}", e);
if let Some(filter) = args.passthrough {
match (e, filter) {
(_, ArticleFilter::Error) | (HtmlError::Panic(_), ArticleFilter::Panic) => {
stdout.write_all(buffer.as_bytes())?
}
_ => {}
}
}
}
Ok(html) => {
if let Some(output_dir) = args.output_dir.as_ref() {
if let Err(e) = write(output_dir, &page, matching_titles, &html) {
error!("Error writing article: {:#}", e);
}
}
}
}
}
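
The passthrough dispatch spread across this hunk reduces to a single predicate. A restatement for clarity — this is a hypothetical helper, not code the PR adds: `Match` forwards every matched article before simplification is even attempted, `Error` forwards any article whose simplification failed, and `Panic` forwards only articles that made the simplifier panic.

```rust
// Hypothetical restatement of the passthrough logic above;
// `failure` is None when processing succeeded (or was skipped).
fn passes(filter: ArticleFilter, failure: Option<&HtmlError>) -> bool {
    match (filter, failure) {
        (ArticleFilter::Match, _) => true,
        (ArticleFilter::Error, Some(_)) => true,
        (ArticleFilter::Panic, Some(HtmlError::Panic(_))) => true,
        _ => false,
    }
}
```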
@@ -275,35 +325,8 @@ fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
html: &str,
) -> anyhow::Result<()> {
let html = if !simplify {
page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
let mut error_file = base.as_ref().to_path_buf();
error_file.push("errors");
if !error_file.exists() {
fs::create_dir(&error_file).context("creating error directory")?;
}
error_file.push(page.name.replace('/', "%2F"));
error_file.set_extension("html");
fs::write(&error_file, &page.article_body.html).context("writing error file")?;
if !msg.is_empty() {
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
} else {
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
Err(e) => bail!(e),
}
};
let article_dir = create_article_dir(&base, page, redirects)?;
// Write html to determined file.
@@ -311,11 +334,11 @@
filename.push(&page.in_language.identifier);
filename.set_extension("html");
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Overwriting existing file");
}
debug!(
file = filename.to_string_lossy().as_ref(),
exists = filename.exists(),
"Writing article"
);
let mut file =
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;

src/lib.rs

@@ -1,57 +1,52 @@
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{
io::{self, BufRead},
str::FromStr,
};
#[macro_use]
extern crate log;
use anyhow::Context;
pub mod html;
pub mod osm;
mod tag_file;
pub use tag_file::*;
pub mod extend;
pub mod wm;
use wm::{Qid, Title};
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Qid::from_str(&line) {
Ok(qid) => collection.extend(Some(qid)),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
let line_num = i + 1;
warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
}
})
.collect())
}
}
Ok(())
}
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikipedia_file(
r: impl BufRead,
collection: &mut impl Extend<Title>,
) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Title::from_osm_tag(&line) {
Ok(title) => collection.extend(Some(title)),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
let line_num = i + 1;
warn!(
"Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
e
);
}
})
.collect())
}
}
Ok(())
}
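
Since the parsers now take any `BufRead` plus any `Extend` collection, they can be driven from memory as easily as from a file. A sketch, assuming `Qid::from_str` accepts the usual `Q`-prefixed form:

```rust
use std::collections::HashSet;
use om_wikiparser::{parse_wikidata_file, wm::Qid};

fn demo() -> std::io::Result<()> {
    // Any `BufRead` works; `&[u8]` stands in for the
    // `BufReader<File>` used in get_articles.rs.
    let input = "Q123\nQ456\nnot a qid\n";
    let mut qids: HashSet<Qid> = HashSet::new();
    // Unparseable lines are logged with their line number and skipped.
    parse_wikidata_file(input.as_bytes(), &mut qids)?;
    assert_eq!(qids.len(), 2);
    Ok(())
}
```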

src/main.rs

@@ -15,7 +15,7 @@ use anyhow::Context;
use clap::{CommandFactory, Parser, Subcommand};
#[macro_use]
extern crate tracing;
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::{filter::EnvFilter, Layer};
use om_wikiparser::osm;
@@ -77,13 +77,7 @@ enum Cmd {
}
fn main() -> anyhow::Result<()> {
// Use info level by default, load overrides from `RUST_LOG` env variable.
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.compact()
.with_writer(stderr)
.init();
init_logger();
let args = Args::parse();
@@ -126,7 +120,8 @@ fn main() -> anyhow::Result<()> {
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
let file = File::open(osm_tags)?;
om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()
@@ -215,6 +210,23 @@ fn main() -> anyhow::Result<()> {
}
}
fn init_logger() {
use tracing::dispatcher::{self, Dispatch};
use tracing_subscriber::{layer::SubscriberExt, Registry};
let subscriber = Registry::default().with(
tracing_logfmt::builder()
.layer()
.with_writer(stderr)
// Use info level by default, load overrides from `RUST_LOG` env variable.
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
.with_filter(EnvFilter::from_default_env()),
);
dispatcher::set_global_default(Dispatch::new(subscriber))
.expect("Global logger has already been set!");
}
/// Determine the number of threads to use.
///
/// If `requested` is <= 0, then the number of cores plus `requested` will be created.
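
With the logfmt layer from `init_logger` installed, structured fields on tracing events — like the reworked `debug!` call in `get_articles.rs` — come out as `key=value` pairs on stderr. Roughly, with illustrative output; the exact default fields are chosen by tracing-logfmt:

```rust
info!(file = "Article.html", exists = false, "Writing article");
// rendered by the logfmt layer as something like:
//   level=info msg="Writing article" file=Article.html exists=false
```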

src/tag_file.rs

@@ -1,4 +1,4 @@
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
use std::{error::Error, fmt::Display, io::Read, str::FromStr};
use anyhow::{anyhow, bail};
@@ -9,19 +9,15 @@ use crate::{
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
r: impl Read,
qids: &mut impl Extend<Qid>,
titles: &mut impl Extend<Title>,
line_errors: &mut impl Extend<ParseLineError>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
line_errors.extend(Some(e));
};
let mut qid_col = None;
@@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
if !qid.is_empty() {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
qids.extend(Some(qid));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@ pub fn parse_osm_tag_file(
if !title.is_empty() {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
titles.extend(Some(title));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
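
Taking `impl Read` and `impl Extend` generalizes the callers seen above: `main.rs` collects `ParseLineError`s into a `Vec`, while `get_articles.rs` only counts them. A sketch combining the new signature with the `extend` helpers — the TSV content here is illustrative:

```rust
use std::collections::HashSet;
use om_wikiparser::{extend, parse_osm_tag_file, wm::{Qid, Title}};

fn demo() -> anyhow::Result<()> {
    // A header row naming the wikidata/wikipedia columns, then one record.
    let tsv = "wikidata\twikipedia\nQ123\ten:Article\n";
    let mut qids: HashSet<Qid> = HashSet::new();
    let mut titles: HashSet<Title> = HashSet::new();

    // Count bad lines without keeping the errors around.
    let mut error_count = 0;
    parse_osm_tag_file(
        tsv.as_bytes(),
        &mut qids,
        &mut titles,
        &mut extend::from_fn(|_| error_count += 1),
    )?;
    println!(
        "{} qids, {} titles, {} bad lines",
        qids.len(),
        titles.len(),
        error_count
    );
    Ok(())
}
```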