Debugging Improvements #39

Merged: newsch merged 3 commits from debugging into main 2024-04-28 18:20:52 +00:00
7 changed files with 245 additions and 115 deletions

Cargo.lock (generated)

@@ -301,6 +301,15 @@ dependencies = [
"memchr",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@@ -679,6 +688,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num_cpus"
version = "1.16.0"
@@ -718,6 +733,7 @@ dependencies = [
"serde_json",
"thiserror",
"tracing",
"tracing-logfmt",
"tracing-subscriber",
"unicode-normalization",
"url",
@@ -877,6 +893,12 @@ version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -1214,18 +1236,18 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
[[package]]
name = "serde"
version = "1.0.163"
version = "1.0.193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.163"
version = "1.0.193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
dependencies = [
"proc-macro2",
"quote",
@@ -1388,6 +1410,37 @@ dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.3.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@@ -1447,6 +1500,18 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-logfmt"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22b8e455f6caa5212a102ec530bf86b8dc5a4c536299bffd84b238fed9119be7"
dependencies = [
"time",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.17"

Cargo.toml

@@ -25,6 +25,7 @@ serde_json = "1.0.96"
thiserror = "1.0.44"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
tracing-logfmt = "0.3.4"
url = "2.3.1"
urlencoding = "2.1.2"

src/extend.rs (new file)

@@ -0,0 +1,38 @@
//! Utilities for working with [Extend].
use std::iter::Extend;
/// Calls `f` for each `Item`.
///
/// ```
/// # use om_wikiparser::extend;
/// let mut count = 0;
///
/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
/// [1, 2, 3, 4],
/// ['a', 'b', 'c']));
/// assert_eq!(count, 3);
/// ```
pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
FromFn(f)
}
pub struct FromFn<F>(F);
impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
for item in iter {
self.0(item);
}
}
}
/// Iterates but drops each `Item`.
pub fn sink() -> Sink {
Sink(())
}
pub struct Sink(());
impl<Item> Extend<Item> for Sink {
fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
for _item in iter {}
}
}
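
`sink()` ships without a doctest; a minimal usage sketch of both helpers, assuming nothing beyond this file — the counting pattern is exactly how `get_articles.rs` tallies tag-file errors below:

```rust
use om_wikiparser::extend;

// Count items without storing them.
let mut error_count = 0;
let mut counter = extend::from_fn(|_line: &str| error_count += 1);
counter.extend(["bad line", "another bad line"]);
assert_eq!(error_count, 2);

// Accept and discard items when an `Extend` argument is
// required but the items themselves are not needed.
extend::sink().extend(0..100);
```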

src/get_articles.rs

@@ -1,6 +1,8 @@
use std::{
borrow::Cow,
collections::HashSet,
fs::{self, File},
io::{stdin, BufRead, Write},
io::{stdin, stdout, BufRead, BufReader, Write},
os::unix,
path::{Path, PathBuf},
};
@@ -8,18 +10,34 @@ use std::{
use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
extend,
html::{self, HtmlError},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
};
#[derive(clap::ValueEnum, Copy, Clone)]
pub enum ArticleFilter {
/// All articles that match on title/QID
Match,
/// Articles that cannot be simplified
Error,
/// Articles that cause panics when simplified
Panic, // FIXME: move panic dumping to this
}
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
#[derive(clap::Args)]
pub struct Args {
/// Directory to write the extracted articles to.
pub output_dir: PathBuf,
#[arg(required_unless_present = "passthrough")]
pub output_dir: Option<PathBuf>,
/// Copy input article JSON to stdout if it matches certain criteria.
#[arg(long)]
pub passthrough: Option<ArticleFilter>,
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
@@ -51,34 +69,34 @@ pub struct Args {
}
pub fn run(args: Args) -> anyhow::Result<()> {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = HashSet::new();
if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
parse_wikipedia_file(path)?
} else {
Default::default()
};
let file = BufReader::new(File::open(path)?);
parse_wikipedia_file(file, &mut wikipedia_titles)?
}
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
let mut wikidata_qids = HashSet::new();
if let Some(path) = args.wikidata_qids {
info!("Loading wikidata QIDs from {path:?}");
parse_wikidata_file(path)?
} else {
Default::default()
let file = BufReader::new(File::open(path)?);
parse_wikidata_file(file, &mut wikidata_qids)?
};
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
let file = File::open(path)?;
let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
let mut error_count = 0;
parse_osm_tag_file(
path,
file,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
&mut extend::from_fn(|_| error_count += 1),
)?;
if !line_errors.is_empty() {
let error_count = line_errors.len();
if error_count != 0 {
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let percentage = 100.0 * error_count as f64 / new_items as f64;
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
@@ -103,10 +121,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
if let Some(output_dir) = &args.output_dir {
if !output_dir.is_dir() {
bail!("output dir {:?} does not exist", output_dir);
}
}
let mut stdout = stdout();
info!("Processing dump");
let mut dump = stdin().lock();
@@ -179,8 +201,36 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
}
if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
error!("Error writing article: {:#}", e);
// Always write regardless of later errors.
if let Some(ArticleFilter::Match) = args.passthrough {
stdout.write_all(buffer.as_bytes())?;
}
let article_output = if args.no_simplify {
Ok(Cow::Borrowed(&page.article_body.html))
} else {
html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
};
match article_output {
Err(e) => {
error!("Error processing article: {:#}", e);
if let Some(filter) = args.passthrough {
match (e, filter) {
(_, ArticleFilter::Error) | (HtmlError::Panic(_), ArticleFilter::Panic) => {
stdout.write_all(buffer.as_bytes())?
}
_ => {}
}
}
}
Ok(html) => {
if let Some(output_dir) = args.output_dir.as_ref() {
if let Err(e) = write(output_dir, &page, matching_titles, &html) {
error!("Error writing article: {:#}", e);
}
}
}
}
}
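
The passthrough dispatch spread across this hunk reduces to a single predicate. A restatement for clarity — this is a hypothetical helper, not code the PR adds: `Match` forwards every matched article before simplification is even attempted, `Error` forwards any article whose simplification failed, and `Panic` forwards only articles that made the simplifier panic.

```rust
// Hypothetical restatement of the passthrough logic above;
// `failure` is None when processing succeeded (or was skipped).
fn passes(filter: ArticleFilter, failure: Option<&HtmlError>) -> bool {
    match (filter, failure) {
        (ArticleFilter::Match, _) => true,
        (ArticleFilter::Error, Some(_)) => true,
        (ArticleFilter::Panic, Some(HtmlError::Panic(_))) => true,
        _ => false,
    }
}
```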
@@ -275,35 +325,8 @@ fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
html: &str,
) -> anyhow::Result<()> {
let html = if !simplify {
page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
let mut error_file = base.as_ref().to_path_buf();
error_file.push("errors");
if !error_file.exists() {
fs::create_dir(&error_file).context("creating error directory")?;
}
error_file.push(page.name.replace('/', "%2F"));
error_file.set_extension("html");
fs::write(&error_file, &page.article_body.html).context("writing error file")?;
if !msg.is_empty() {
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
} else {
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
Err(e) => bail!(e),
}
};
let article_dir = create_article_dir(&base, page, redirects)?;
// Write html to determined file.
@@ -311,11 +334,11 @@
filename.push(&page.in_language.identifier);
filename.set_extension("html");
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Overwriting existing file");
}
debug!(
file = filename.to_string_lossy().as_ref(),
exists = filename.exists(),
"Writing article"
);
let mut file =
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;

src/lib.rs

@@ -1,57 +1,52 @@
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{
io::{self, BufRead},
str::FromStr,
};
#[macro_use]
extern crate log;
use anyhow::Context;
pub mod html;
pub mod osm;
mod tag_file;
pub use tag_file::*;
pub mod extend;
pub mod wm;
use wm::{Qid, Title};
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Qid::from_str(&line) {
Ok(qid) => collection.extend(Some(qid)),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
let line_num = i + 1;
warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
}
})
.collect())
}
}
Ok(())
}
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikipedia_file(
r: impl BufRead,
collection: &mut impl Extend<Title>,
) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Title::from_osm_tag(&line) {
Ok(title) => collection.extend(Some(title)),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
let line_num = i + 1;
warn!(
"Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
e
);
}
})
.collect())
}
}
Ok(())
}
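
Since the parsers now take any `BufRead` plus any `Extend` collection, they can be driven from memory as easily as from a file. A sketch, assuming `Qid::from_str` accepts the usual `Q`-prefixed form:

```rust
use std::collections::HashSet;
use om_wikiparser::{parse_wikidata_file, wm::Qid};

fn demo() -> std::io::Result<()> {
    // Any `BufRead` works; `&[u8]` stands in for the
    // `BufReader<File>` used in get_articles.rs.
    let input = "Q123\nQ456\nnot a qid\n";
    let mut qids: HashSet<Qid> = HashSet::new();
    // Unparseable lines are logged with their line number and skipped.
    parse_wikidata_file(input.as_bytes(), &mut qids)?;
    assert_eq!(qids.len(), 2);
    Ok(())
}
```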

src/main.rs

@@ -15,7 +15,7 @@ use anyhow::Context;
use clap::{CommandFactory, Parser, Subcommand};
#[macro_use]
extern crate tracing;
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::{filter::EnvFilter, Layer};
use om_wikiparser::osm;
@@ -77,13 +77,7 @@ enum Cmd {
}
fn main() -> anyhow::Result<()> {
// Use info level by default, load overrides from `RUST_LOG` env variable.
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.compact()
.with_writer(stderr)
.init();
init_logger();
let args = Args::parse();
@@ -126,7 +120,8 @@ fn main() -> anyhow::Result<()> {
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
let file = File::open(osm_tags)?;
om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()
@@ -215,6 +210,23 @@ fn main() -> anyhow::Result<()> {
}
}
fn init_logger() {
use tracing::dispatcher::{self, Dispatch};
use tracing_subscriber::{layer::SubscriberExt, Registry};
let subscriber = Registry::default().with(
tracing_logfmt::builder()
.layer()
.with_writer(stderr)
// Use info level by default, load overrides from `RUST_LOG` env variable.
// See https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html
.with_filter(EnvFilter::from_default_env()),
);
dispatcher::set_global_default(Dispatch::new(subscriber))
.expect("Global logger has already been set!");
}
/// Determine the number of threads to use.
///
/// If `requested` is <= 0, then the number of cores plus `requested` will be created.
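
With the logfmt layer from `init_logger` installed, structured fields on tracing events — like the reworked `debug!` call in `get_articles.rs` — come out as `key=value` pairs on stderr. Roughly, with illustrative output; the exact default fields are chosen by tracing-logfmt:

```rust
info!(file = "Article.html", exists = false, "Writing article");
// rendered by the logfmt layer as something like:
//   level=info msg="Writing article" file=Article.html exists=false
```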

src/tag_file.rs

@@ -1,4 +1,4 @@
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
use std::{error::Error, fmt::Display, io::Read, str::FromStr};
use anyhow::{anyhow, bail};
@@ -9,19 +9,15 @@ use crate::{
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
r: impl Read,
qids: &mut impl Extend<Qid>,
titles: &mut impl Extend<Title>,
line_errors: &mut impl Extend<ParseLineError>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
line_errors.extend(Some(e));
};
let mut qid_col = None;
@@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
if !qid.is_empty() {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
qids.extend(Some(qid));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@ pub fn parse_osm_tag_file(
if !title.is_empty() {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
titles.extend(Some(title));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
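
Taking `impl Read` and `impl Extend` generalizes the callers seen above: `main.rs` collects `ParseLineError`s into a `Vec`, while `get_articles.rs` only counts them. A sketch combining the new signature with the `extend` helpers — the TSV content here is illustrative:

```rust
use std::collections::HashSet;
use om_wikiparser::{extend, parse_osm_tag_file, wm::{Qid, Title}};

fn demo() -> anyhow::Result<()> {
    // A header row naming the wikidata/wikipedia columns, then one record.
    let tsv = "wikidata\twikipedia\nQ123\ten:Article\n";
    let mut qids: HashSet<Qid> = HashSet::new();
    let mut titles: HashSet<Title> = HashSet::new();

    // Count bad lines without keeping the errors around.
    let mut error_count = 0;
    parse_osm_tag_file(
        tsv.as_bytes(),
        &mut qids,
        &mut titles,
        &mut extend::from_fn(|_| error_count += 1),
    )?;
    println!(
        "{} qids, {} titles, {} bad lines",
        qids.len(),
        titles.len(),
        error_count
    );
    Ok(())
}
```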