Add command to write tag errors to file

- Write a TSV file with the line number, error, and input text (see the illustrative example below).
- Include OSM object id if available in tag file.
- Update run script to write file once before extracting.
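
As a rough illustration, the header row matches the one written by the new `check-tags` command; the data row below is invented (line number, OSM id, and value are hypothetical), with tab-separated columns:

```
line	kind	osm_id	error	value
1205	title	240109189	no ':' separating lang and title	Coyote Creek Trail
```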

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt 2023-08-24 15:42:19 -04:00 committed by Evan Lloyd New-Schmidt
parent 218e55931f
commit 292eeac081
5 changed files with 109 additions and 26 deletions

View file

@@ -65,25 +65,30 @@ Run the program with the `--help` flag to see all supported arguments.
```
$ cargo run --release -- --help
Extract articles from Wikipedia Enterprise HTML dumps
A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
Usage: om-wikiparser <COMMAND>
Commands:
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
check-tags Attempt to parse extracted OSM tags and write errors to stdout in TSV format
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
simplify Apply html simplification to a single article
help Print this message or the help of the given subcommand(s)
Options:
-h, --help Print help (see more with '--help')
-V, --version Print version
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
```
Each command has its own additional help:
```
$ cargo run -- get-articles --help
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
@@ -100,6 +105,9 @@ Options:
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
--no-simplify
Don't process extracted HTML; write the original text to disk
-h, --help
Print help (see a summary with '-h')
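
As a concrete usage example (illustrative; it mirrors the `run.sh` change below), the new subcommand reads a tag file produced by `get-tags` and writes the error report to stdout:

```
$ cargo run --release -- check-tags osm_tags.tsv > osm_tags_errors.tsv
```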

run.sh (3 changes)
View file

@@ -100,6 +100,9 @@ cd "$BUILD_DIR"
log "Extracting tags from '$OSM_FILE'"
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
log "Writing tag parse errors to $BUILD_DIR/osm_tags_errors.tsv"
"$wikiparser" check-tags osm_tags.tsv > osm_tags_errors.tsv
# Enable backtraces in errors and panics.
# NOTE: Backtraces are still printed for panics that are caught higher in the stack.
# export RUST_BACKTRACE=1
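
Because the error file is plain TSV with a header row, it can be summarized with standard tools once `run.sh` has written it; for example (illustrative only), counting errors per kind from the second column:

```
$ tail -n +2 osm_tags_errors.tsv | cut -f2 | sort | uniq -c
```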

View file

@@ -1,4 +1,5 @@
use std::{
collections::HashSet,
env,
fs::File,
io::{stdin, stdout, BufReader, Read, Write},
@@ -17,7 +18,7 @@ extern crate log;
mod get_articles;
mod get_tags;
/// Extract articles from Wikipedia Enterprise HTML dumps.
/// A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
#[derive(Parser)]
#[command(author, version, about, long_about, version = crate::version())]
struct Args {
@@ -27,11 +28,10 @@ struct Args {
#[derive(Subcommand)]
enum Cmd {
GetArticles(get_articles::Args),
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
///
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
/// Unlike `osmconvert`, this **does not** truncate long tag values and create invalid UTF-8.
GetTags {
/// The `.osm.pbf` file to use.
pbf_file: PathBuf,
@@ -46,8 +46,23 @@ enum Cmd {
threads: Option<isize>,
},
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
/// Attempt to parse extracted OSM tags and write errors to stdout in TSV format.
CheckTags {
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
#[arg(value_name = "FILE.tsv")]
osm_tags: PathBuf,
},
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
GetArticles(get_articles::Args),
/// Apply html simplification to a single article.
///
/// Reads from stdin and writes the simplified version to stdout.
/// This is meant for testing and debugging.
Simplify {
/// The language to use when processing the article (defaults to `en`).
@@ -97,6 +112,42 @@ fn main() -> anyhow::Result<()> {
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
get_tags::run(pbf_file)
}
Cmd::CheckTags { osm_tags } => {
let mut qids = HashSet::new();
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::wm::parse_osm_tag_file(
osm_tags,
&mut qids,
&mut titles,
Some(&mut errors),
)?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(stdout().lock());
writer.write_record(["line", "kind", "osm_id", "error", "value"])?;
for error in errors {
use om_wikiparser::wm::ParseErrorKind::*;
let kind = error.kind.to_string();
let id = error
.osm_id
.as_ref()
.map(ToString::to_string)
.unwrap_or_default();
let e: anyhow::Error = match error.kind {
Title(e) => e.into(),
Qid(e) => e.into(),
Tsv(e) => e.into(),
};
let msg = e.to_string();
writer.write_record([&error.line.to_string(), &kind, &id, &msg, &error.text])?;
}
Ok(())
}
Cmd::Simplify { lang } => {
let mut input = String::new();
stdin().read_to_string(&mut input)?;

View file

@@ -72,10 +72,12 @@ pub fn parse_osm_tag_file(
let mut qid_col = None;
let mut title_col = None;
let mut osm_id_col = None;
for (column, title) in rdr.headers()?.iter().enumerate() {
match title {
"wikidata" => qid_col = Some(column),
"wikipedia" => title_col = Some(column),
"@id" => osm_id_col = Some(column),
_ => (),
}
}
@@ -97,12 +99,15 @@ pub fn parse_osm_tag_file(
push_error(ParseLineError {
text: String::new(),
line: rdr.position().line(),
osm_id: None,
kind: e.into(),
});
continue;
}
}
let osm_id = osm_id_col.and_then(|i| row[i].parse().ok());
let qid = &row[qid_col].trim();
if !qid.is_empty() {
match Qid::from_str(qid) {
@@ -112,6 +117,7 @@
Err(e) => push_error(ParseLineError {
text: qid.to_string(),
line: rdr.position().line(),
osm_id,
kind: e.into(),
}),
}
@@ -126,6 +132,7 @@
Err(e) => push_error(ParseLineError {
text: title.to_string(),
line: rdr.position().line(),
osm_id,
kind: e.into(),
}),
}
@@ -137,25 +144,31 @@
#[derive(Debug, thiserror::Error)]
pub enum ParseErrorKind {
#[error("bad title")]
#[error("title")]
Title(#[from] ParseTitleError),
#[error("bad QID")]
#[error("QID")]
Qid(#[from] ParseQidError),
#[error("bad TSV line")]
#[error("TSV line")]
Tsv(#[from] csv::Error),
}
#[derive(Debug)]
pub struct ParseLineError {
text: String,
line: u64,
kind: ParseErrorKind,
pub text: String,
pub line: u64,
pub osm_id: Option<usize>,
pub kind: ParseErrorKind,
}
impl Display for ParseLineError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// write source chain to ensure they are logged
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
write!(f, "on line {}", self.line)?;
if let Some(osm_id) = self.osm_id {
write!(f, " ({osm_id})")?;
}
write!(f, ": {} {:?}", self.kind, self.text)?;
// Write source error chain to ensure they are logged.
let mut source = self.kind.source();
while let Some(e) = source {
write!(f, ": {}", e)?;
@@ -167,7 +180,7 @@ impl Display for ParseLineError {
impl Error for ParseLineError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
// return nothing b/c Display prints source chain
// Return nothing because Display prints source chain.
None
}
}
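
For reference, with the updated `Display` implementation a single logged error now renders along these lines (the line number and OSM id are invented; the message text comes from the `MissingColon` variant in the file below):

```
on line 1205 (240109189): title "Coyote Creek Trail": no ':' separating lang and title
```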

View file

@@ -48,7 +48,12 @@ impl Title {
// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
let url = Url::parse(url.trim())?;
let url = url.trim();
if url.is_empty() {
return Err(ParseTitleError::Empty);
}
let url = Url::parse(url)?;
let (subdomain, host) = url
.host_str()
@@ -79,10 +84,11 @@ impl Title {
// en:Article Title
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or(ParseTitleError::MissingColon)?;
let tag = tag.trim();
if tag.is_empty() {
return Err(ParseTitleError::Empty);
}
let (lang, title) = tag.split_once(':').ok_or(ParseTitleError::MissingColon)?;
let lang = lang.trim_start();
let title = title.trim_start();
@@ -125,9 +131,11 @@ impl Title {
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ParseTitleError {
#[error("title cannot be empty or whitespace")]
#[error("value is empty or whitespace")]
Empty,
#[error("title is empty or whitespace")]
NoTitle,
#[error("lang cannot be empty or whitespace")]
#[error("lang is empty or whitespace")]
NoLang,
#[error("no ':' separating lang and title")]
MissingColon,
@@ -141,7 +149,7 @@ pub enum ParseTitleError {
NoHost,
#[error("no subdomain in url")]
NoSubdomain,
#[error("url base domain is wikipedia.org")]
#[error("url base domain is not wikipedia.org")]
BadDomain,
#[error("url base path is not /wiki/")]
BadPath,
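
To make the new empty-input handling concrete, here is a minimal sketch of how the parsers behave after this change; the `om_wikiparser::wm` import path is assumed from the `main.rs` hunk above, and the inputs are the forms documented in the comments:

```rust
use om_wikiparser::wm::{ParseTitleError, Title};

fn main() {
    // Both documented input forms still parse.
    assert!(Title::from_url("https://en.wikipedia.org/wiki/Article_Title").is_ok());
    assert!(Title::from_osm_tag("en:Article Title").is_ok());

    // Blank values are now reported as the dedicated `Empty` variant
    // instead of surfacing as a URL parse or missing-colon error.
    assert_eq!(Title::from_url("   ").unwrap_err(), ParseTitleError::Empty);
    assert_eq!(Title::from_osm_tag("").unwrap_err(), ParseTitleError::Empty);
}
```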