From 292eeac081b6380bc41f37fa7833f77218d0d9a4 Mon Sep 17 00:00:00 2001 From: Evan Lloyd New-Schmidt Date: Thu, 24 Aug 2023 15:42:19 -0400 Subject: [PATCH] Add command to write tag errors to file - Write a TSV file with the line number, error, and input text. - Include OSM object id if available in tag file. - Update run script to write file once before extracting. Signed-off-by: Evan Lloyd New-Schmidt --- README.md | 18 ++++++++++----- run.sh | 3 +++ src/main.rs | 59 +++++++++++++++++++++++++++++++++++++++++++++---- src/wm/mod.rs | 31 ++++++++++++++++++-------- src/wm/title.rs | 24 +++++++++++++------- 5 files changed, 109 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 666597d..1184142 100644 --- a/README.md +++ b/README.md @@ -65,25 +65,30 @@ Run the program with the `--help` flag to see all supported arguments. ``` $ cargo run --release -- --help -Extract articles from Wikipedia Enterprise HTML dumps +A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags. 
Usage: om-wikiparser <COMMAND> Commands: - get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump - simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout + check-tags Attempt to parse extracted OSM tags and write errors to stdout in TSV format + get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps + simplify Apply html simplification to a single article help Print this message or the help of the given subcommand(s) Options: - -h, --help Print help (see more with '--help') - -V, --version Print version + -h, --help + Print help (see a summary with '-h') + + -V, --version + Print version ``` Each command has its own additional help: ``` $ cargo run -- get-articles --help + Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps. Expects an uncompressed dump (newline-delimited JSON) connected to stdin. @@ -100,6 +105,9 @@ Options: Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances. + --no-simplify + Don't process extracted HTML; write the original text to disk + -h, --help Print help (see a summary with '-h') diff --git a/run.sh b/run.sh index 6b56306..910f9ee 100755 --- a/run.sh +++ b/run.sh @@ -100,6 +100,9 @@ cd "$BUILD_DIR" log "Extracting tags from '$OSM_FILE'" "$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv +log "Writing tag parse errors to $BUILD_DIR/osm_tags_errors.tsv" +"$wikiparser" check-tags osm_tags.tsv > osm_tags_errors.tsv + # Enable backtraces in errors and panics. # NOTE: Backtraces are still printed for panics that are caught higher in the stack. 
# export RUST_BACKTRACE=1 diff --git a/src/main.rs b/src/main.rs index ea6d1fc..9804a83 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ use std::{ + collections::HashSet, env, fs::File, io::{stdin, stdout, BufReader, Read, Write}, @@ -17,7 +18,7 @@ extern crate log; mod get_articles; mod get_tags; -/// Extract articles from Wikipedia Enterprise HTML dumps. +/// A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags. #[derive(Parser)] #[command(author, version, about, long_about, version = crate::version())] struct Args { @@ -27,11 +28,10 @@ struct Args { #[derive(Subcommand)] enum Cmd { - GetArticles(get_articles::Args), - /// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump. /// /// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`. + /// Unlike `osmconvert`, this **does not** truncate long tag values and create invalid UTF-8. GetTags { /// The `.osm.pbf` file to use. pbf_file: PathBuf, @@ -46,8 +46,23 @@ enum Cmd { threads: Option, }, - /// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout. + /// Attempt to parse extracted OSM tags and write errors to stdout in TSV format. + CheckTags { + /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns. + /// + /// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`. + #[arg(value_name = "FILE.tsv")] + osm_tags: PathBuf, + }, + + /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps. /// + /// Expects an uncompressed dump (newline-delimited JSON) connected to stdin. + GetArticles(get_articles::Args), + + /// Apply html simplification to a single article. + /// + /// Reads from stdin and writes the simplified version to stdout. /// This is meant for testing and debugging. Simplify { /// The language to use when processing the article (defaults to `en`). 
@@ -97,6 +112,42 @@ fn main() -> anyhow::Result<()> { let pbf_file = File::open(pbf_file).map(BufReader::new)?; get_tags::run(pbf_file) } + Cmd::CheckTags { osm_tags } => { + let mut qids = HashSet::new(); + let mut titles = HashSet::new(); + let mut errors = Vec::new(); + info!("Reading osm tag file"); + om_wikiparser::wm::parse_osm_tag_file( + osm_tags, + &mut qids, + &mut titles, + Some(&mut errors), + )?; + info!("Found {} errors in tag file", errors.len()); + + let mut writer = csv::WriterBuilder::new() + .delimiter(b'\t') + .from_writer(stdout().lock()); + writer.write_record(["line", "kind", "osm_id", "error", "value"])?; + for error in errors { + use om_wikiparser::wm::ParseErrorKind::*; + let kind = error.kind.to_string(); + let id = error + .osm_id + .as_ref() + .map(ToString::to_string) + .unwrap_or_default(); + let e: anyhow::Error = match error.kind { + Title(e) => e.into(), + Qid(e) => e.into(), + Tsv(e) => e.into(), + }; + let msg = e.to_string(); + writer.write_record([&error.line.to_string(), &kind, &id, &msg, &error.text])?; + } + + Ok(()) + } Cmd::Simplify { lang } => { let mut input = String::new(); stdin().read_to_string(&mut input)?; diff --git a/src/wm/mod.rs b/src/wm/mod.rs index 530b41a..d8c3d89 100644 --- a/src/wm/mod.rs +++ b/src/wm/mod.rs @@ -72,10 +72,12 @@ pub fn parse_osm_tag_file( let mut qid_col = None; let mut title_col = None; + let mut osm_id_col = None; for (column, title) in rdr.headers()?.iter().enumerate() { match title { "wikidata" => qid_col = Some(column), "wikipedia" => title_col = Some(column), + "@id" => osm_id_col = Some(column), _ => (), } } @@ -97,12 +99,15 @@ pub fn parse_osm_tag_file( push_error(ParseLineError { text: String::new(), line: rdr.position().line(), + osm_id: None, kind: e.into(), }); continue; } } + let osm_id = osm_id_col.and_then(|i| row[i].parse().ok()); + let qid = &row[qid_col].trim(); if !qid.is_empty() { match Qid::from_str(qid) { @@ -112,6 +117,7 @@ pub fn parse_osm_tag_file( Err(e) => 
push_error(ParseLineError { text: qid.to_string(), line: rdr.position().line(), + osm_id, kind: e.into(), }), } @@ -126,6 +132,7 @@ pub fn parse_osm_tag_file( Err(e) => push_error(ParseLineError { text: title.to_string(), line: rdr.position().line(), + osm_id, kind: e.into(), }), } @@ -137,25 +144,31 @@ pub fn parse_osm_tag_file( #[derive(Debug, thiserror::Error)] pub enum ParseErrorKind { - #[error("bad title")] + #[error("title")] Title(#[from] ParseTitleError), - #[error("bad QID")] + #[error("QID")] Qid(#[from] ParseQidError), - #[error("bad TSV line")] + #[error("TSV line")] Tsv(#[from] csv::Error), } #[derive(Debug)] pub struct ParseLineError { - text: String, - line: u64, - kind: ParseErrorKind, + pub text: String, + pub line: u64, + pub osm_id: Option, + pub kind: ParseErrorKind, } impl Display for ParseLineError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // write source chain to ensure they are logged - write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?; + write!(f, "on line {}", self.line)?; + if let Some(osm_id) = self.osm_id { + write!(f, " ({osm_id})")?; + } + write!(f, ": {} {:?}", self.kind, self.text)?; + + // Write source error chain to ensure they are logged. let mut source = self.kind.source(); while let Some(e) = source { write!(f, ": {}", e)?; @@ -167,7 +180,7 @@ impl Display for ParseLineError { impl Error for ParseLineError { fn source(&self) -> Option<&(dyn Error + 'static)> { - // return nothing b/c Display prints source chain + // Return nothing because Display prints source chain. 
None } } diff --git a/src/wm/title.rs b/src/wm/title.rs index 5d0879a..eac49f3 100644 --- a/src/wm/title.rs +++ b/src/wm/title.rs @@ -48,7 +48,12 @@ impl Title { // https://en.wikipedia.org/wiki/Article_Title/More_Title pub fn from_url(url: &str) -> Result { - let url = Url::parse(url.trim())?; + let url = url.trim(); + if url.is_empty() { + return Err(ParseTitleError::Empty); + } + + let url = Url::parse(url)?; let (subdomain, host) = url .host_str() @@ -79,10 +84,11 @@ impl Title { // en:Article Title pub fn from_osm_tag(tag: &str) -> Result { - let (lang, title) = tag - .trim() - .split_once(':') - .ok_or(ParseTitleError::MissingColon)?; + let tag = tag.trim(); + if tag.is_empty() { + return Err(ParseTitleError::Empty); + } + let (lang, title) = tag.split_once(':').ok_or(ParseTitleError::MissingColon)?; let lang = lang.trim_start(); let title = title.trim_start(); @@ -125,9 +131,11 @@ impl Title { #[derive(Debug, PartialEq, Eq, thiserror::Error)] pub enum ParseTitleError { - #[error("title cannot be empty or whitespace")] + #[error("value is empty or whitespace")] + Empty, + #[error("title is empty or whitespace")] NoTitle, - #[error("lang cannot be empty or whitespace")] + #[error("lang is empty or whitespace")] NoLang, #[error("no ':' separating lang and title")] MissingColon, @@ -141,7 +149,7 @@ pub enum ParseTitleError { NoHost, #[error("no subdomain in url")] NoSubdomain, - #[error("url base domain is wikipedia.org")] + #[error("url base domain is not wikipedia.org")] BadDomain, #[error("url base path is not /wiki/")] BadPath,