Add option to dump input json to stdout
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

commit 1f7d0695e2
parent cb835fcbc6

2 changed files with 61 additions and 40 deletions
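
Before the diff, a minimal standalone sketch of how the new argument surface behaves. This is a hypothetical top-level clap program, not the crate's actual CLI (the real struct derives clap::Args and sits under a subcommand); it assumes clap v4 derive defaults, under which `required_unless_present = "passthrough"` lets `output_dir` be omitted only when `--passthrough` is given, and the ArticleFilter variants become the values `match`, `error`, and `panic`.

use clap::Parser;
use std::path::PathBuf;

/// Hypothetical stand-in for the real ArticleFilter.
#[derive(clap::ValueEnum, Copy, Clone, Debug)]
enum ArticleFilter {
    Match,
    Error,
    Panic,
}

#[derive(clap::Parser, Debug)]
struct Args {
    /// May be omitted only when --passthrough is present.
    #[arg(required_unless_present = "passthrough")]
    output_dir: Option<PathBuf>,

    /// Accepts the kebab-case variant names: match, error, panic.
    #[arg(long)]
    passthrough: Option<ArticleFilter>,
}

fn main() {
    // `prog out/` and `prog --passthrough error` both parse;
    // `prog` alone is rejected because output_dir is then required.
    let args = Args::parse();
    println!("output_dir={:?} passthrough={:?}", args.output_dir, args.passthrough);
}
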
@@ -1,6 +1,7 @@
 use std::{
+    borrow::Cow,
     fs::{self, File},
-    io::{stdin, BufRead, Write},
+    io::{stdin, stdout, BufRead, Write},
     os::unix,
     path::{Path, PathBuf},
 };
@@ -13,13 +14,28 @@ use om_wikiparser::{
     wm::{Page, Title},
 };
 
+#[derive(clap::ValueEnum, Copy, Clone)]
+pub enum ArticleFilter {
+    /// All articles that match on title/QID
+    Match,
+    /// Articles that cannot be simplified
+    Error,
+    /// Articles that cause panics when simplified
+    Panic, // FIXME: move panic dumping to this
+}
+
 /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
 ///
 /// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
 #[derive(clap::Args)]
 pub struct Args {
     /// Directory to write the extracted articles to.
-    pub output_dir: PathBuf,
+    #[arg(required_unless_present = "passthrough")]
+    pub output_dir: Option<PathBuf>,
+
+    /// Copy input article JSON to stdout if it matches certain criteria.
+    #[arg(long)]
+    pub passthrough: Option<ArticleFilter>,
 
     /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
     ///
@@ -103,10 +119,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
         .map(|p| File::options().create(true).append(true).open(p))
         .transpose()?;
 
-    if !args.output_dir.is_dir() {
-        bail!("output dir {:?} does not exist", args.output_dir)
+    if let Some(output_dir) = &args.output_dir {
+        if !output_dir.is_dir() {
+            bail!("output dir {:?} does not exist", output_dir);
+        }
     }
 
+    let mut stdout = stdout();
+
     info!("Processing dump");
     let mut dump = stdin().lock();
 
@@ -179,8 +199,36 @@ pub fn run(args: Args) -> anyhow::Result<()> {
             }
         }
 
-        if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
-            error!("Error writing article: {:#}", e);
+        // Always write regardless of later errors.
+        if let Some(ArticleFilter::Match) = args.passthrough {
+            stdout.write_all(buffer.as_bytes())?;
+        }
+
+        let article_output = if args.no_simplify {
+            Ok(Cow::Borrowed(&page.article_body.html))
+        } else {
+            html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
+        };
+
+        match article_output {
+            Err(e) => {
+                error!("Error processing article: {:#}", e);
+                if let Some(filter) = args.passthrough {
+                    match (e, filter) {
+                        (_, ArticleFilter::Error) | (HtmlError::Panic(_), ArticleFilter::Panic) => {
+                            stdout.write_all(buffer.as_bytes())?
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            Ok(html) => {
+                if let Some(output_dir) = args.output_dir.as_ref() {
+                    if let Err(e) = write(output_dir, &page, matching_titles, &html) {
+                        error!("Error writing article: {:#}", e);
+                    }
+                }
+            }
         }
     }
 
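For orientation only, a rough self-contained sketch (not the crate's code) of the passthrough idea used above: read newline-delimited JSON from stdin line by line and echo the untouched line to stdout whenever some predicate holds, so the output remains a valid dump that can be piped back into the tool.

use std::io::{stdin, stdout, BufRead, Write};

fn main() -> std::io::Result<()> {
    let mut out = stdout().lock();
    for line in stdin().lock().lines() {
        let line = line?;
        // Placeholder predicate; the real code keys off simplification
        // results (match / error / panic) rather than the raw text.
        let keep = line.contains("article_body");
        if keep {
            // Echo the original bytes so downstream consumers see valid NDJSON.
            out.write_all(line.as_bytes())?;
            out.write_all(b"\n")?;
        }
    }
    Ok(())
}
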
@@ -275,35 +323,8 @@ fn write(
     base: impl AsRef<Path>,
     page: &Page,
     redirects: impl IntoIterator<Item = Title>,
-    simplify: bool,
+    html: &str,
 ) -> anyhow::Result<()> {
-    let html = if !simplify {
-        page.article_body.html.to_string()
-    } else {
-        match html::process_str(&page.article_body.html, &page.in_language.identifier) {
-            Ok(html) => html,
-            Err(HtmlError::Panic(msg)) => {
-                // Write original article text to disk
-                let mut error_file = base.as_ref().to_path_buf();
-                error_file.push("errors");
-                if !error_file.exists() {
-                    fs::create_dir(&error_file).context("creating error directory")?;
-                }
-                error_file.push(page.name.replace('/', "%2F"));
-                error_file.set_extension("html");
-
-                fs::write(&error_file, &page.article_body.html).context("writing error file")?;
-
-                if !msg.is_empty() {
-                    bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
-                } else {
-                    bail!("panic occurred while processing html (saved to {error_file:?})");
-                }
-            }
-            Err(e) => bail!(e),
-        }
-    };
-
     let article_dir = create_article_dir(&base, page, redirects)?;
 
     // Write html to determined file.
@@ -311,11 +332,11 @@ fn write(
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");
 
-    debug!("{:?}: {:?}", page.name, filename);
-
-    if filename.exists() {
-        debug!("Overwriting existing file");
-    }
+    debug!(
+        file = filename.to_string_lossy().as_ref(),
+        exists = filename.exists(),
+        "Writing article"
+    );
 
     let mut file =
         File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
@@ -41,7 +41,7 @@ pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<T
         .lines()
         .enumerate()
         .map(|(i, line)| {
-            Title::from_url(line).with_context(|| {
+            Title::from_osm_tag(line).with_context(|| {
                 let line_num = i + 1;
                 format!("on line {line_num}: {line:?}")
            })
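
The last hunk appears to switch the wikipedia-file parsing from full Wikipedia URLs to OSM-style `wikipedia` tag values, which take the form `lang:Article Title`. A rough illustration of that split follows; it is not the crate's implementation, and the real `Title::from_osm_tag` may validate more.

/// Illustrative only; not the crate's implementation.
fn split_osm_wikipedia_tag(value: &str) -> Option<(&str, &str)> {
    // OSM `wikipedia` tags look like "en:Riga Central Market".
    let (lang, title) = value.split_once(':')?;
    if lang.is_empty() || title.is_empty() {
        return None;
    }
    Some((lang, title))
}

fn main() {
    assert_eq!(
        split_osm_wikipedia_tag("en:Riga Central Market"),
        Some(("en", "Riga Central Market"))
    );
    // A bare title without a language prefix is rejected.
    assert_eq!(split_osm_wikipedia_tag("Riga"), None);
}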