diff --git a/Cargo.lock b/Cargo.lock
index ad954f6..65a789f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -301,6 +301,15 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+ "powerfmt",
+]
+
 [[package]]
 name = "derive_more"
 version = "0.99.17"
@@ -679,6 +688,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
 [[package]]
 name = "num_cpus"
 version = "1.16.0"
@@ -718,6 +733,7 @@ dependencies = [
  "serde_json",
  "thiserror",
  "tracing",
+ "tracing-logfmt",
  "tracing-subscriber",
  "unicode-normalization",
  "url",
@@ -877,6 +893,12 @@ version = "0.2.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
 
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -1214,18 +1236,18 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
 
 [[package]]
 name = "serde"
-version = "1.0.163"
+version = "1.0.193"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
+checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.163"
+version = "1.0.193"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
+checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1388,6 +1410,37 @@ dependencies = [
  "once_cell",
 ]
 
+[[package]]
+name = "time"
+version = "0.3.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+
+[[package]]
+name = "time-macros"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
 [[package]]
 name = "tinyvec"
 version = "1.6.0"
@@ -1447,6 +1500,18 @@ dependencies = [
  "tracing-core",
 ]
 
+[[package]]
+name = "tracing-logfmt"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22b8e455f6caa5212a102ec530bf86b8dc5a4c536299bffd84b238fed9119be7"
+dependencies = [
+ "time",
+ "tracing",
+ "tracing-core",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-subscriber"
 version = "0.3.17"
diff --git a/Cargo.toml b/Cargo.toml
index 3baec8f..d49f10f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,7 @@
 serde_json = "1.0.96"
 thiserror = "1.0.44"
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
+tracing-logfmt = "0.3.4"
 url = "2.3.1"
 urlencoding = "2.1.2"
diff --git a/src/extend.rs b/src/extend.rs
new file mode 100644
index 0000000..523c9bb
--- /dev/null
+++ b/src/extend.rs
@@ -0,0 +1,38 @@
+//! Utilities for working with [Extend].
+use std::iter::Extend;
+
+/// Calls `f` for each `Item`.
+///
+/// ```
+/// # use om_wikiparser::extend;
+/// let mut count = 0;
+///
+/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
+///     [1, 2, 3, 4],
+///     ['a', 'b', 'c']));
+/// assert_eq!(count, 3);
+/// ```
+pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
+    FromFn(f)
+}
+
+pub struct FromFn<F>(F);
+impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
+    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
+        for item in iter {
+            self.0(item);
+        }
+    }
+}
+
+/// Iterates but drops each `Item`.
+pub fn sink() -> Sink {
+    Sink(())
+}
+
+pub struct Sink(());
+impl<Item> Extend<Item> for Sink {
+    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
+        for _item in iter {}
+    }
+}
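Note (not part of the diff): the doctest above only exercises `from_fn`; `sink` has no example. A minimal sketch of both sinks, assuming the crate imports as `om_wikiparser` like the doctest does:

```rust
use om_wikiparser::extend;

fn main() {
    // FromFn calls the closure once per item, so this counts without storing.
    let mut count = 0;
    extend::from_fn(|_| count += 1).extend([1, 2, 3]);
    assert_eq!(count, 3);

    // Sink accepts and drops items of any type.
    extend::sink().extend(["ignored", "values"]);
}
```

Because both types only implement `Extend`, they plug into any API that takes `&mut impl Extend<T>`, which is exactly how `get_articles.rs` below counts tag-file errors without allocating a `Vec`.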
diff --git a/src/get_articles.rs b/src/get_articles.rs
index 8730acf..4add8c9 100644
--- a/src/get_articles.rs
+++ b/src/get_articles.rs
@@ -1,6 +1,8 @@
 use std::{
+    borrow::Cow,
+    collections::HashSet,
     fs::{self, File},
-    io::{stdin, BufRead, Write},
+    io::{stdin, stdout, BufRead, BufReader, Write},
     os::unix,
     path::{Path, PathBuf},
 };
@@ -8,18 +10,34 @@ use std::{
 
 use anyhow::{anyhow, bail, Context};
 
 use om_wikiparser::{
+    extend,
     html::{self, HtmlError},
     parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
     wm::{Page, Title},
 };
 
+#[derive(clap::ValueEnum, Copy, Clone)]
+pub enum ArticleFilter {
+    /// All articles that match on title/QID
+    Match,
+    /// Articles that cannot be simplified
+    Error,
+    /// Articles that cause panics when simplified
+    Panic, // FIXME: move panic dumping to this
+}
+
 /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
 ///
 /// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
 #[derive(clap::Args)]
 pub struct Args {
     /// Directory to write the extracted articles to.
-    pub output_dir: PathBuf,
+    #[arg(required_unless_present = "passthrough")]
+    pub output_dir: Option<PathBuf>,
+
+    /// Copy input article JSON to stdout if it matches certain criteria.
+    #[arg(long)]
+    pub passthrough: Option<ArticleFilter>,
 
     /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
     ///
@@ -51,34 +69,34 @@ pub struct Args {
 }
 
 pub fn run(args: Args) -> anyhow::Result<()> {
-    let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+    let mut wikipedia_titles = HashSet::new();
+    if let Some(path) = args.wikipedia_urls {
         info!("Loading article urls from {path:?}");
-        parse_wikipedia_file(path)?
-    } else {
-        Default::default()
-    };
+        let file = BufReader::new(File::open(path)?);
+        parse_wikipedia_file(file, &mut wikipedia_titles)?
+    }
 
-    let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
+    let mut wikidata_qids = HashSet::new();
+    if let Some(path) = args.wikidata_qids {
         info!("Loading wikidata QIDs from {path:?}");
-        parse_wikidata_file(path)?
-    } else {
-        Default::default()
+        let file = BufReader::new(File::open(path)?);
+        parse_wikidata_file(file, &mut wikidata_qids)?
     };
 
     if let Some(ref path) = args.osm_tags {
         info!("Loading wikipedia/wikidata osm tags from {path:?}");
+        let file = File::open(path)?;
         let original_items = wikidata_qids.len() + wikipedia_titles.len();
-        let mut line_errors = Vec::new();
+        let mut error_count = 0;
         parse_osm_tag_file(
-            path,
+            file,
             &mut wikidata_qids,
             &mut wikipedia_titles,
-            Some(&mut line_errors),
+            &mut extend::from_fn(|_| error_count += 1),
         )?;
-        if !line_errors.is_empty() {
-            let error_count = line_errors.len();
+        if error_count != 0 {
             let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
             let percentage = 100.0 * error_count as f64 / new_items as f64;
             warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
@@ -103,10 +121,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
         .map(|p| File::options().create(true).append(true).open(p))
         .transpose()?;
 
-    if !args.output_dir.is_dir() {
-        bail!("output dir {:?} does not exist", args.output_dir)
+    if let Some(output_dir) = &args.output_dir {
+        if !output_dir.is_dir() {
+            bail!("output dir {:?} does not exist", output_dir);
+        }
     }
 
+    let mut stdout = stdout();
+
     info!("Processing dump");
     let mut dump = stdin().lock();
@@ -179,8 +201,36 @@ pub fn run(args: Args) -> anyhow::Result<()> {
             }
         }
 
-        if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
-            error!("Error writing article: {:#}", e);
+        // Always write regardless of later errors.
+        if let Some(ArticleFilter::Match) = args.passthrough {
+            stdout.write_all(buffer.as_bytes())?;
+        }
+
+        let article_output = if args.no_simplify {
+            Ok(Cow::Borrowed(&page.article_body.html))
+        } else {
+            html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
+        };
+
+        match article_output {
+            Err(e) => {
+                error!("Error processing article: {:#}", e);
+                if let Some(filter) = args.passthrough {
+                    match (e, filter) {
+                        (_, ArticleFilter::Error) | (HtmlError::Panic(_), ArticleFilter::Panic) => {
+                            stdout.write_all(buffer.as_bytes())?
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            Ok(html) => {
+                if let Some(output_dir) = args.output_dir.as_ref() {
+                    if let Err(e) = write(output_dir, &page, matching_titles, &html) {
+                        error!("Error writing article: {:#}", e);
+                    }
+                }
+            }
         }
     }
 
@@ -275,35 +325,8 @@ fn write(
     base: impl AsRef<Path>,
     page: &Page,
     redirects: impl IntoIterator<Item = Title>,
-    simplify: bool,
+    html: &str,
 ) -> anyhow::Result<()> {
-    let html = if !simplify {
-        page.article_body.html.to_string()
-    } else {
-        match html::process_str(&page.article_body.html, &page.in_language.identifier) {
-            Ok(html) => html,
-            Err(HtmlError::Panic(msg)) => {
-                // Write original article text to disk
-                let mut error_file = base.as_ref().to_path_buf();
-                error_file.push("errors");
-                if !error_file.exists() {
-                    fs::create_dir(&error_file).context("creating error directory")?;
-                }
-                error_file.push(page.name.replace('/', "%2F"));
-                error_file.set_extension("html");
-
-                fs::write(&error_file, &page.article_body.html).context("writing error file")?;
-
-                if !msg.is_empty() {
-                    bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
-                } else {
-                    bail!("panic occurred while processing html (saved to {error_file:?})");
-                }
-            }
-            Err(e) => bail!(e),
-        }
-    };
-
     let article_dir = create_article_dir(&base, page, redirects)?;
 
     // Write html to determined file.
@@ -311,11 +334,11 @@ fn write(
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");
 
-    debug!("{:?}: {:?}", page.name, filename);
-
-    if filename.exists() {
-        debug!("Overwriting existing file");
-    }
+    debug!(
+        file = filename.to_string_lossy().as_ref(),
+        exists = filename.exists(),
+        "Writing article"
+    );
 
     let mut file = File::create(&filename)
         .with_context(|| format!("creating html file {:?}", filename))?;
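Note (not part of the diff): the new `article_output` value threads a `Cow` through the loop, so the `--no-simplify` path borrows the original HTML instead of copying it and only the simplified path allocates. A standalone sketch of that pattern with a hypothetical stand-in for `html::process_str`:

```rust
use std::borrow::Cow;

// Illustrative stand-in for `html::process_str`: any fallible transform that allocates.
fn simplify(html: &str) -> Result<String, ()> {
    Ok(html.trim().to_owned())
}

// Borrow when passing through untouched; own only when a new String was produced.
fn article_output(html: &str, no_simplify: bool) -> Result<Cow<'_, str>, ()> {
    if no_simplify {
        Ok(Cow::Borrowed(html))
    } else {
        simplify(html).map(Cow::Owned)
    }
}

fn main() {
    let raw = "  <p>article</p>  ";
    assert!(matches!(article_output(raw, true), Ok(Cow::Borrowed(_))));
    assert!(matches!(article_output(raw, false), Ok(Cow::Owned(_))));
}
```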
diff --git a/src/lib.rs b/src/lib.rs
index 841a84e..06e0f91 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,57 +1,52 @@
-use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
+use std::{
+    io::{self, BufRead},
+    str::FromStr,
+};
 
 #[macro_use]
 extern crate log;
-use anyhow::Context;
 
 pub mod html;
 pub mod osm;
 mod tag_file;
 pub use tag_file::*;
+pub mod extend;
 pub mod wm;
 
 use wm::{Qid, Title};
 
 /// Read from a file of urls on each line.
-pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
-    let contents = fs::read_to_string(path.as_ref())?;
-    Ok(contents
-        .lines()
-        .enumerate()
-        .map(|(i, line)| {
-            Qid::from_str(line).with_context(|| {
-                let line_num = i + 1;
-                format!("on line {line_num}: {line:?}")
-            })
-        })
-        .filter_map(|r| match r {
-            Ok(qid) => Some(qid),
+pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
+    for (i, line) in r.lines().enumerate() {
+        let line = line?;
+        match Qid::from_str(&line) {
+            Ok(qid) => collection.extend(Some(qid)),
             Err(e) => {
-                warn!("Could not parse QID: {:#}", e);
-                None
+                let line_num = i + 1;
+                warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
             }
-        })
-        .collect())
+        }
+    }
+    Ok(())
 }
 
 /// Read article titles from a file of urls on each line.
-pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
-    let contents = fs::read_to_string(path.as_ref())?;
-    Ok(contents
-        .lines()
-        .enumerate()
-        .map(|(i, line)| {
-            Title::from_url(line).with_context(|| {
-                let line_num = i + 1;
-                format!("on line {line_num}: {line:?}")
-            })
-        })
-        .filter_map(|r| match r {
-            Ok(qid) => Some(qid),
+pub fn parse_wikipedia_file(
+    r: impl BufRead,
+    collection: &mut impl Extend<Title>,
+) -> io::Result<()> {
+    for (i, line) in r.lines().enumerate() {
+        let line = line?;
+        match Title::from_osm_tag(&line) {
+            Ok(title) => collection.extend(Some(title)),
             Err(e) => {
-                warn!("Could not parse wikipedia title: {:#}", e);
-                None
+                let line_num = i + 1;
+                warn!(
+                    "Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
+                    e
+                );
             }
-        })
-        .collect())
+        }
+    }
+    Ok(())
 }
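Note (not part of the diff): taking `impl BufRead` plus `impl Extend` instead of a path and a returned `HashSet` means the parsers accept any in-memory buffer, which makes them trivially testable. A hedged usage sketch, assuming `Qid::from_str` accepts the usual `Q`-prefixed form and rejects other strings:

```rust
use std::collections::HashSet;

fn main() -> std::io::Result<()> {
    // &[u8] implements BufRead, so no temporary file is needed.
    let input = "Q42\nQ64\nnot a qid\n";
    let mut qids = HashSet::new();
    om_wikiparser::parse_wikidata_file(input.as_bytes(), &mut qids)?;

    // The malformed line is logged via `warn!` and skipped, not a hard error.
    assert_eq!(qids.len(), 2);
    Ok(())
}
```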
diff --git a/src/main.rs b/src/main.rs
index 9c125a9..5ac7301 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,7 +15,7 @@ use anyhow::Context;
 use clap::{CommandFactory, Parser, Subcommand};
 #[macro_use]
 extern crate tracing;
-use tracing_subscriber::filter::EnvFilter;
+use tracing_subscriber::{filter::EnvFilter, Layer};
 
 use om_wikiparser::osm;
 
@@ -77,13 +77,7 @@ enum Cmd {
 }
 
 fn main() -> anyhow::Result<()> {
-    // Use info level by default, load overrides from `RUST_LOG` env variable.
-    // See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
-    tracing_subscriber::fmt()
-        .with_env_filter(EnvFilter::from_default_env())
-        .compact()
-        .with_writer(stderr)
-        .init();
+    init_logger();
 
     let args = Args::parse();
 
@@ -126,7 +120,8 @@ fn main() -> anyhow::Result<()> {
             let mut titles = HashSet::new();
             let mut errors = Vec::new();
             info!("Reading osm tag file");
-            om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
+            let file = File::open(osm_tags)?;
+            om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
             info!("Found {} errors in tag file", errors.len());
 
             let mut writer = csv::WriterBuilder::new()
@@ -215,6 +210,23 @@ fn main() -> anyhow::Result<()> {
     }
 }
 
+fn init_logger() {
+    use tracing::dispatcher::{self, Dispatch};
+    use tracing_subscriber::{layer::SubscriberExt, Registry};
+
+    let subscriber = Registry::default().with(
+        tracing_logfmt::builder()
+            .layer()
+            .with_writer(stderr)
+            // Use info level by default, load overrides from `RUST_LOG` env variable.
+            // See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
+            .with_filter(EnvFilter::from_default_env()),
+    );
+
+    dispatcher::set_global_default(Dispatch::new(subscriber))
+        .expect("Global logger has already been set!");
+}
+
 /// Determine the number of threads to use.
 ///
 /// If `requested` is <= 0, then the number of cores plus `requested` will be created.
diff --git a/src/tag_file.rs b/src/tag_file.rs
index 93d6415..0111f0e 100644
--- a/src/tag_file.rs
+++ b/src/tag_file.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
+use std::{error::Error, fmt::Display, io::Read, str::FromStr};
 
 use anyhow::{anyhow, bail};
 
@@ -9,19 +9,15 @@ use crate::{
 
 /// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
 pub fn parse_osm_tag_file(
-    path: impl AsRef<OsStr>,
-    qids: &mut HashSet<Qid>,
-    titles: &mut HashSet<Title>,
-    mut line_errors: Option<&mut Vec<ParseLineError>>,
+    r: impl Read,
+    qids: &mut impl Extend<Qid>,
+    titles: &mut impl Extend<Title>,
+    line_errors: &mut impl Extend<ParseLineError>,
 ) -> anyhow::Result<()> {
-    let path = path.as_ref();
-    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
+    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
 
     let mut push_error = |e: ParseLineError| {
-        debug!("Tag parse error: {e}");
-        if let Some(ref mut errs) = line_errors {
-            errs.push(e);
-        }
+        line_errors.extend(Some(e));
     };
 
     let mut qid_col = None;
@@ -84,7 +80,7 @@
         if !qid.is_empty() {
             match Qid::from_str(qid) {
                 Ok(qid) => {
-                    qids.insert(qid);
+                    qids.extend(Some(qid));
                 }
                 Err(e) => {
                     let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@
         if !title.is_empty() {
             match Title::from_osm_tag(title) {
                 Ok(title) => {
-                    titles.insert(title);
+                    titles.extend(Some(title));
                 }
                 Err(e) => {
                     let (osm_id, osm_type, osm_version) = parse_metadata();
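Note (not part of the diff): with the error channel now `impl Extend<ParseLineError>`, each caller picks its own sink; the two call sites changed above show both strategies. A sketch of each (the TSV literal is illustrative only; real extracts carry additional OSM metadata columns, and the exact column requirements live in tag_file.rs):

```rust
use om_wikiparser::{extend, parse_osm_tag_file};
use std::collections::HashSet;

fn main() -> anyhow::Result<()> {
    // &[u8] implements Read; assumes a header with `wikidata`/`wikipedia` columns.
    let tsv: &[u8] = b"wikidata\twikipedia\nQ64\tde:Berlin\n";

    let mut qids = HashSet::new();
    let mut titles = HashSet::new();

    // Collect every error for later reporting, as main.rs does.
    let mut errors = Vec::new();
    parse_osm_tag_file(tsv, &mut qids, &mut titles, &mut errors)?;

    // Or just count errors without storing them, as get_articles.rs does.
    let mut error_count = 0;
    parse_osm_tag_file(
        tsv,
        &mut qids,
        &mut titles,
        &mut extend::from_fn(|_| error_count += 1),
    )?;

    Ok(())
}
```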