Add option to dump input JSON to stdout

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-11-16 13:48:09 -05:00 committed by Evan Lloyd New-Schmidt
parent cb835fcbc6
commit 1f7d0695e2
2 changed files with 61 additions and 40 deletions

View file

@ -1,6 +1,7 @@
use std::{
borrow::Cow,
fs::{self, File},
io::{stdin, BufRead, Write},
io::{stdin, stdout, BufRead, Write},
os::unix,
path::{Path, PathBuf},
};
@ -13,13 +14,28 @@ use om_wikiparser::{
wm::{Page, Title},
};
/// Criteria selecting which input articles are copied to stdout by the
/// `passthrough` option of [`Args`].
///
/// Derives `Debug`, `PartialEq` and `Eq` in addition to the clap value-enum
/// support so the filter can be logged and compared directly.
// NOTE: the per-variant doc comments below are used by clap as the help text
// for each possible value; reword them only if the CLI help should change.
#[derive(clap::ValueEnum, Copy, Clone, Debug, PartialEq, Eq)]
pub enum ArticleFilter {
    /// All articles that match on title/QID
    Match,
    /// Articles that cannot be simplified
    Error,
    /// Articles that cause panics when simplified
    Panic, // FIXME: move panic dumping to this
}
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
#[derive(clap::Args)]
pub struct Args {
/// Directory to write the extracted articles to.
pub output_dir: PathBuf,
#[arg(required_unless_present = "passthrough")]
pub output_dir: Option<PathBuf>,
/// Copy input article JSON to stdout if it matches certain criteria.
#[arg(long)]
pub passthrough: Option<ArticleFilter>,
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
@ -103,10 +119,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
if let Some(output_dir) = &args.output_dir {
if !output_dir.is_dir() {
bail!("output dir {:?} does not exist", output_dir);
}
}
let mut stdout = stdout();
info!("Processing dump");
let mut dump = stdin().lock();
@ -179,8 +199,36 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
}
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
error!("Error writing article: {:#}", e);
// Always write regardless of later errors.
if let Some(ArticleFilter::Match) = args.passthrough {
stdout.write_all(buffer.as_bytes())?;
}
let article_output = if args.no_simplify {
Ok(Cow::Borrowed(&page.article_body.html))
} else {
html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
};
match article_output {
Err(e) => {
error!("Error processing article: {:#}", e);
if let Some(filter) = args.passthrough {
match (e, filter) {
(_, ArticleFilter::Error) | (HtmlError::Panic(_), ArticleFilter::Panic) => {
stdout.write_all(buffer.as_bytes())?
}
_ => {}
}
}
}
Ok(html) => {
if let Some(output_dir) = args.output_dir.as_ref() {
if let Err(e) = write(output_dir, &page, matching_titles, &html) {
error!("Error writing article: {:#}", e);
}
}
}
}
}
@ -275,35 +323,8 @@ fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
html: &str,
) -> anyhow::Result<()> {
let html = if !simplify {
page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
let mut error_file = base.as_ref().to_path_buf();
error_file.push("errors");
if !error_file.exists() {
fs::create_dir(&error_file).context("creating error directory")?;
}
error_file.push(page.name.replace('/', "%2F"));
error_file.set_extension("html");
fs::write(&error_file, &page.article_body.html).context("writing error file")?;
if !msg.is_empty() {
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
} else {
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
Err(e) => bail!(e),
}
};
let article_dir = create_article_dir(&base, page, redirects)?;
// Write html to determined file.
@ -311,11 +332,11 @@ fn write(
filename.push(&page.in_language.identifier);
filename.set_extension("html");
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Overwriting existing file");
}
debug!(
file = filename.to_string_lossy().as_ref(),
exists = filename.exists(),
"Writing article"
);
let mut file =
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;

View file

@ -41,7 +41,7 @@ pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<T
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_url(line).with_context(|| {
Title::from_osm_tag(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})