Add command to write tag errors to file
- Write a TSV file with the line number, error, and input text.
- Include OSM object id if available in tag file.
- Update run script to write file once before extracting.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
218e55931f
commit
292eeac081
5 changed files with 109 additions and 26 deletions
18
README.md
18
README.md
|
@ -65,25 +65,30 @@ Run the program with the `--help` flag to see all supported arguments.
|
|||
|
||||
```
|
||||
$ cargo run --release -- --help
|
||||
Extract articles from Wikipedia Enterprise HTML dumps
|
||||
A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
|
||||
|
||||
Usage: om-wikiparser <COMMAND>
|
||||
|
||||
Commands:
|
||||
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
|
||||
get-tags Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
|
||||
simplify Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
|
||||
check-tags Attempt to parse extracted OSM tags and write errors to stdout in TSV format
|
||||
get-articles Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
|
||||
simplify Apply html simplification to a single article
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
-h, --help Print help (see more with '--help')
|
||||
-V, --version Print version
|
||||
-h, --help
|
||||
Print help (see a summary with '-h')
|
||||
|
||||
-V, --version
|
||||
Print version
|
||||
```
|
||||
|
||||
Each command has its own additional help:
|
||||
|
||||
```
|
||||
$ cargo run -- get-articles --help
|
||||
|
||||
Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
|
||||
Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
|
@ -100,6 +105,9 @@ Options:
|
|||
|
||||
Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomically appended to the file, so the same path may be used by multiple concurrent instances.
|
||||
|
||||
--no-simplify
|
||||
Don't process extracted HTML; write the original text to disk
|
||||
|
||||
-h, --help
|
||||
Print help (see a summary with '-h')
|
||||
|
||||
|
|
3
run.sh
3
run.sh
|
@ -100,6 +100,9 @@ cd "$BUILD_DIR"
|
|||
log "Extracting tags from '$OSM_FILE'"
|
||||
"$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
|
||||
|
||||
log "Writing tag parse errors to $BUILD_DIR/osm_tags_errors.tsv"
|
||||
"$wikiparser" check-tags osm_tags.tsv > osm_tags_errors.tsv
|
||||
|
||||
# Enable backtraces in errors and panics.
|
||||
# NOTE: Backtraces are still printed for panics that are caught higher in the stack.
|
||||
# export RUST_BACKTRACE=1
|
||||
|
|
59
src/main.rs
59
src/main.rs
|
@ -1,4 +1,5 @@
|
|||
use std::{
|
||||
collections::HashSet,
|
||||
env,
|
||||
fs::File,
|
||||
io::{stdin, stdout, BufReader, Read, Write},
|
||||
|
@ -17,7 +18,7 @@ extern crate log;
|
|||
mod get_articles;
|
||||
mod get_tags;
|
||||
|
||||
/// Extract articles from Wikipedia Enterprise HTML dumps.
|
||||
/// A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about, version = crate::version())]
|
||||
struct Args {
|
||||
|
@ -27,11 +28,10 @@ struct Args {
|
|||
|
||||
#[derive(Subcommand)]
|
||||
enum Cmd {
|
||||
GetArticles(get_articles::Args),
|
||||
|
||||
/// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
|
||||
///
|
||||
/// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
|
||||
/// Unlike `osmconvert`, this **does not** truncate long tag values and create invalid UTF-8.
|
||||
GetTags {
|
||||
/// The `.osm.pbf` file to use.
|
||||
pbf_file: PathBuf,
|
||||
|
@ -46,8 +46,23 @@ enum Cmd {
|
|||
threads: Option<isize>,
|
||||
},
|
||||
|
||||
/// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
|
||||
/// Attempt to parse extracted OSM tags and write errors to stdout in TSV format.
|
||||
CheckTags {
|
||||
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
|
||||
///
|
||||
/// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
|
||||
#[arg(value_name = "FILE.tsv")]
|
||||
osm_tags: PathBuf,
|
||||
},
|
||||
|
||||
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
///
|
||||
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
|
||||
GetArticles(get_articles::Args),
|
||||
|
||||
/// Apply html simplification to a single article.
|
||||
///
|
||||
/// Reads from stdin and writes the simplified version to stdout.
|
||||
/// This is meant for testing and debugging.
|
||||
Simplify {
|
||||
/// The language to use when processing the article (defaults to `en`).
|
||||
|
@ -97,6 +112,42 @@ fn main() -> anyhow::Result<()> {
|
|||
let pbf_file = File::open(pbf_file).map(BufReader::new)?;
|
||||
get_tags::run(pbf_file)
|
||||
}
|
||||
Cmd::CheckTags { osm_tags } => {
|
||||
let mut qids = HashSet::new();
|
||||
let mut titles = HashSet::new();
|
||||
let mut errors = Vec::new();
|
||||
info!("Reading osm tag file");
|
||||
om_wikiparser::wm::parse_osm_tag_file(
|
||||
osm_tags,
|
||||
&mut qids,
|
||||
&mut titles,
|
||||
Some(&mut errors),
|
||||
)?;
|
||||
info!("Found {} errors in tag file", errors.len());
|
||||
|
||||
let mut writer = csv::WriterBuilder::new()
|
||||
.delimiter(b'\t')
|
||||
.from_writer(stdout().lock());
|
||||
writer.write_record(["line", "kind", "osm_id", "error", "value"])?;
|
||||
for error in errors {
|
||||
use om_wikiparser::wm::ParseErrorKind::*;
|
||||
let kind = error.kind.to_string();
|
||||
let id = error
|
||||
.osm_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or_default();
|
||||
let e: anyhow::Error = match error.kind {
|
||||
Title(e) => e.into(),
|
||||
Qid(e) => e.into(),
|
||||
Tsv(e) => e.into(),
|
||||
};
|
||||
let msg = e.to_string();
|
||||
writer.write_record([&error.line.to_string(), &kind, &id, &msg, &error.text])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Cmd::Simplify { lang } => {
|
||||
let mut input = String::new();
|
||||
stdin().read_to_string(&mut input)?;
|
||||
|
|
|
@ -72,10 +72,12 @@ pub fn parse_osm_tag_file(
|
|||
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
let mut osm_id_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
match title {
|
||||
"wikidata" => qid_col = Some(column),
|
||||
"wikipedia" => title_col = Some(column),
|
||||
"@id" => osm_id_col = Some(column),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
@ -97,12 +99,15 @@ pub fn parse_osm_tag_file(
|
|||
push_error(ParseLineError {
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
osm_id: None,
|
||||
kind: e.into(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let osm_id = osm_id_col.and_then(|i| row[i].parse().ok());
|
||||
|
||||
let qid = &row[qid_col].trim();
|
||||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
|
@ -112,6 +117,7 @@ pub fn parse_osm_tag_file(
|
|||
Err(e) => push_error(ParseLineError {
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
|
@ -126,6 +132,7 @@ pub fn parse_osm_tag_file(
|
|||
Err(e) => push_error(ParseLineError {
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
|
@ -137,25 +144,31 @@ pub fn parse_osm_tag_file(
|
|||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
#[error("bad title")]
|
||||
#[error("title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
#[error("bad QID")]
|
||||
#[error("QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
#[error("bad TSV line")]
|
||||
#[error("TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
text: String,
|
||||
line: u64,
|
||||
kind: ParseErrorKind,
|
||||
pub text: String,
|
||||
pub line: u64,
|
||||
pub osm_id: Option<usize>,
|
||||
pub kind: ParseErrorKind,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// write source chain to ensure they are logged
|
||||
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
|
||||
write!(f, "on line {}", self.line)?;
|
||||
if let Some(osm_id) = self.osm_id {
|
||||
write!(f, " ({osm_id})")?;
|
||||
}
|
||||
write!(f, ": {} {:?}", self.kind, self.text)?;
|
||||
|
||||
// Write source error chain to ensure they are logged.
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
|
@ -167,7 +180,7 @@ impl Display for ParseLineError {
|
|||
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// return nothing b/c Display prints source chain
|
||||
// Return nothing because Display prints source chain.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
|
@ -48,7 +48,12 @@ impl Title {
|
|||
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
|
||||
let url = Url::parse(url.trim())?;
|
||||
let url = url.trim();
|
||||
if url.is_empty() {
|
||||
return Err(ParseTitleError::Empty);
|
||||
}
|
||||
|
||||
let url = Url::parse(url)?;
|
||||
|
||||
let (subdomain, host) = url
|
||||
.host_str()
|
||||
|
@ -79,10 +84,11 @@ impl Title {
|
|||
|
||||
// en:Article Title
|
||||
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
|
||||
let (lang, title) = tag
|
||||
.trim()
|
||||
.split_once(':')
|
||||
.ok_or(ParseTitleError::MissingColon)?;
|
||||
let tag = tag.trim();
|
||||
if tag.is_empty() {
|
||||
return Err(ParseTitleError::Empty);
|
||||
}
|
||||
let (lang, title) = tag.split_once(':').ok_or(ParseTitleError::MissingColon)?;
|
||||
|
||||
let lang = lang.trim_start();
|
||||
let title = title.trim_start();
|
||||
|
@ -125,9 +131,11 @@ impl Title {
|
|||
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
pub enum ParseTitleError {
|
||||
#[error("title cannot be empty or whitespace")]
|
||||
#[error("value is empty or whitespace")]
|
||||
Empty,
|
||||
#[error("title is empty or whitespace")]
|
||||
NoTitle,
|
||||
#[error("lang cannot be empty or whitespace")]
|
||||
#[error("lang is empty or whitespace")]
|
||||
NoLang,
|
||||
#[error("no ':' separating lang and title")]
|
||||
MissingColon,
|
||||
|
@ -141,7 +149,7 @@ pub enum ParseTitleError {
|
|||
NoHost,
|
||||
#[error("no subdomain in url")]
|
||||
NoSubdomain,
|
||||
#[error("url base domain is wikipedia.org")]
|
||||
#[error("url base domain is not wikipedia.org")]
|
||||
BadDomain,
|
||||
#[error("url base path is not /wiki/")]
|
||||
BadPath,
|
||||
|
|
Loading…
Add table
Reference in a new issue