Structure parse errors and only log warning if above threshold

- Add custom error types with `thiserror` crate in preparation for #25.
- Parsing errors are captured instead of logged to `warn` by default.
    - All parsing errors are still logged to `debug` level.
    - If >= 0.02% of tags can't be parsed, an error is logged.
    - TSV line errors are always logged as errors.
    - I/O errors will fail instead of be logged.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-09 14:10:40 -04:00 committed by Evan Lloyd New-Schmidt
parent 34bb9318d5
commit 941d2b1032
7 changed files with 155 additions and 40 deletions

1
Cargo.lock generated
View file

@ -692,6 +692,7 @@ dependencies = [
"scraper",
"serde",
"serde_json",
"thiserror",
"url",
"urlencoding",
]

View file

@ -20,6 +20,7 @@ rayon = "1.7.0"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
thiserror = "1.0.44"
url = "2.3.1"
urlencoding = "2.1.2"

View file

@ -60,9 +60,34 @@ pub fn run(args: Args) -> anyhow::Result<()> {
Default::default()
};
if let Some(path) = args.osm_tags {
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?;
let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
parse_osm_tag_file(
path,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
)?;
if !line_errors.is_empty() {
let error_count = line_errors.len();
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let expected_threshold = 0.02;
let percentage = 100.0 * error_count as f64 / new_items as f64;
let level = if percentage >= expected_threshold {
log::Level::Error
} else {
log::Level::Info
};
log!(
level,
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
);
}
}
debug!("Parsed {} unique article titles", wikipedia_titles.len());

View file

@ -1,7 +1,7 @@
//! Wikimedia types
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, Context};
mod page;
pub use page::Page;
@ -58,10 +58,18 @@ pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
};
let mut qid_col = None;
let mut title_col = None;
for (column, title) in rdr.headers()?.iter().enumerate() {
@ -83,7 +91,14 @@ pub fn parse_osm_tag_file(
Ok(false) => break,
// attempt to recover from parsing errors
Err(e) => {
error!("Error parsing tsv file: {}", e);
if e.is_io_error() {
bail!(e)
}
push_error(ParseLineError {
text: String::new(),
line: rdr.position().line(),
kind: e.into(),
});
continue;
}
}
@ -94,13 +109,11 @@ pub fn parse_osm_tag_file(
Ok(qid) => {
qids.insert(qid);
}
Err(e) => warn!(
"Cannot parse qid {:?} on line {} in {:?}: {}",
qid,
rdr.position().line(),
path,
e
),
Err(e) => push_error(ParseLineError {
text: qid.to_string(),
line: rdr.position().line(),
kind: e.into(),
}),
}
}
@ -110,16 +123,51 @@ pub fn parse_osm_tag_file(
Ok(title) => {
titles.insert(title);
}
Err(e) => warn!(
"Cannot parse title {:?} on line {} in {:?}: {}",
title,
rdr.position().line(),
path,
e
),
Err(e) => push_error(ParseLineError {
text: title.to_string(),
line: rdr.position().line(),
kind: e.into(),
}),
}
}
}
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub enum ParseErrorKind {
#[error("bad title")]
Title(#[from] ParseTitleError),
#[error("bad QID")]
Qid(#[from] ParseQidError),
#[error("bad TSV line")]
Tsv(#[from] csv::Error),
}
#[derive(Debug)]
pub struct ParseLineError {
text: String,
line: u64,
kind: ParseErrorKind,
}
impl Display for ParseLineError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// write source chain to ensure they are logged
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
let mut source = self.kind.source();
while let Some(e) = source {
write!(f, ": {}", e)?;
source = e.source();
}
Ok(())
}
}
impl Error for ParseLineError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
// return nothing b/c Display prints source chain
None
}
}

View file

@ -1,5 +1,6 @@
use std::{iter, str::FromStr};
use anyhow::Context;
use serde::Deserialize;
use super::{Qid, Title};
@ -35,6 +36,7 @@ impl Page {
/// Title of the article
pub fn title(&self) -> anyhow::Result<Title> {
Title::from_title(&self.name, &self.in_language.identifier)
.with_context(|| format!("bad title {:?}", self.name))
}
/// All titles that lead to the article, the main title followed by any redirects.
@ -43,9 +45,10 @@ impl Page {
}
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
self.redirects
.iter()
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
self.redirects.iter().map(|r| {
Title::from_title(&r.name, &self.in_language.identifier)
.with_context(|| format!("bad redirect {:?}", self.name))
})
}
}

View file

@ -1,4 +1,4 @@
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
/// Wikidata QID/Q Number
///
@ -21,15 +21,13 @@ use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Qid(u32);
pub type ParseQidError = ParseIntError;
impl FromStr for Qid {
type Err = ParseQidError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(Qid)
u32::from_str(s).map(Qid).map_err(ParseQidError)
}
}
@ -49,3 +47,18 @@ impl Qid {
path
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct ParseQidError(ParseIntError);
impl Display for ParseQidError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl Error for ParseQidError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
self.0.source()
}
}

View file

@ -1,6 +1,4 @@
use std::{fmt::Display, path::PathBuf};
use anyhow::{anyhow, bail};
use std::{fmt::Display, path::PathBuf, string::FromUtf8Error};
use url::Url;
@ -49,17 +47,17 @@ impl Title {
}
// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
let url = Url::parse(url.trim())?;
let (subdomain, host) = url
.host_str()
.ok_or_else(|| anyhow!("Expected host"))?
.ok_or(ParseTitleError::NoHost)?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
.ok_or(ParseTitleError::NoSubdomain)?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
return Err(ParseTitleError::BadDomain);
}
let lang = subdomain;
@ -69,10 +67,10 @@ impl Title {
.strip_prefix('/')
.unwrap_or(path)
.split_once('/')
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
.ok_or(ParseTitleError::ShortPath)?;
if root != "wiki" {
bail!("Expected 'wiki' as root path, got: {:?}", root)
return Err(ParseTitleError::BadPath);
}
let title = urlencoding::decode(title)?;
@ -80,11 +78,11 @@ impl Title {
}
// en:Article Title
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or_else(|| anyhow!("Expected ':'"))?;
.ok_or(ParseTitleError::MissingColon)?;
let lang = lang.trim_start();
let title = title.trim_start();
@ -100,14 +98,14 @@ impl Title {
Self::from_title(title, lang)
}
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> {
let title = title.trim();
let lang = lang.trim();
if title.is_empty() {
bail!("title cannot be empty or whitespace");
return Err(ParseTitleError::NoTitle);
}
if lang.is_empty() {
bail!("lang cannot be empty or whitespace");
return Err(ParseTitleError::NoLang);
}
let name = Self::normalize_title(title);
let lang = lang.to_owned();
@ -124,3 +122,29 @@ impl Title {
path
}
}
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ParseTitleError {
#[error("title cannot be empty or whitespace")]
NoTitle,
#[error("lang cannot be empty or whitespace")]
NoLang,
#[error("no ':' separating lang and title")]
MissingColon,
// url-specific
#[error("cannot parse url")]
Url(#[from] url::ParseError),
#[error("cannot decode url")]
UrlDecode(#[from] FromUtf8Error),
#[error("no host in url")]
NoHost,
#[error("no subdomain in url")]
NoSubdomain,
#[error("url base domain is wikipedia.org")]
BadDomain,
#[error("url base path is not /wiki/")]
BadPath,
#[error("path has less than 2 segments")]
ShortPath,
}