Structure parse errors and only log warning if above threshold
- Add custom error types with the `thiserror` crate in preparation for #25.
- Parsing errors are captured instead of being logged at `warn` level by default.
- All parsing errors are still logged at `debug` level.
- If >= 0.02% of tags can't be parsed, an error is logged.
- TSV line errors are always logged as errors.
- I/O errors now cause a failure instead of being logged.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
34bb9318d5
commit
941d2b1032
7 changed files with 155 additions and 40 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -692,6 +692,7 @@ dependencies = [
|
|||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"url",
|
||||
"urlencoding",
|
||||
]
|
||||
|
|
|
@ -20,6 +20,7 @@ rayon = "1.7.0"
|
|||
scraper = "0.16.0"
|
||||
serde = { version = "1.0.163", features = ["derive"] }
|
||||
serde_json = "1.0.96"
|
||||
thiserror = "1.0.44"
|
||||
url = "2.3.1"
|
||||
urlencoding = "2.1.2"
|
||||
|
||||
|
|
|
@ -60,9 +60,34 @@ pub fn run(args: Args) -> anyhow::Result<()> {
|
|||
Default::default()
|
||||
};
|
||||
|
||||
if let Some(path) = args.osm_tags {
|
||||
if let Some(ref path) = args.osm_tags {
|
||||
info!("Loading wikipedia/wikidata osm tags from {path:?}");
|
||||
parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?;
|
||||
|
||||
let original_items = wikidata_qids.len() + wikipedia_titles.len();
|
||||
let mut line_errors = Vec::new();
|
||||
parse_osm_tag_file(
|
||||
path,
|
||||
&mut wikidata_qids,
|
||||
&mut wikipedia_titles,
|
||||
Some(&mut line_errors),
|
||||
)?;
|
||||
|
||||
if !line_errors.is_empty() {
|
||||
let error_count = line_errors.len();
|
||||
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
|
||||
let expected_threshold = 0.02;
|
||||
let percentage = 100.0 * error_count as f64 / new_items as f64;
|
||||
let level = if percentage >= expected_threshold {
|
||||
log::Level::Error
|
||||
} else {
|
||||
log::Level::Info
|
||||
};
|
||||
|
||||
log!(
|
||||
level,
|
||||
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Parsed {} unique article titles", wikipedia_titles.len());
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//! Wikimedia types
|
||||
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
|
||||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
mod page;
|
||||
pub use page::Page;
|
||||
|
@ -58,10 +58,18 @@ pub fn parse_osm_tag_file(
|
|||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
};
|
||||
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
|
@ -83,7 +91,14 @@ pub fn parse_osm_tag_file(
|
|||
Ok(false) => break,
|
||||
// attempt to recover from parsing errors
|
||||
Err(e) => {
|
||||
error!("Error parsing tsv file: {}", e);
|
||||
if e.is_io_error() {
|
||||
bail!(e)
|
||||
}
|
||||
push_error(ParseLineError {
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -94,13 +109,11 @@ pub fn parse_osm_tag_file(
|
|||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
}
|
||||
Err(e) => warn!(
|
||||
"Cannot parse qid {:?} on line {} in {:?}: {}",
|
||||
qid,
|
||||
rdr.position().line(),
|
||||
path,
|
||||
e
|
||||
),
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,16 +123,51 @@ pub fn parse_osm_tag_file(
|
|||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
Err(e) => warn!(
|
||||
"Cannot parse title {:?} on line {} in {:?}: {}",
|
||||
title,
|
||||
rdr.position().line(),
|
||||
path,
|
||||
e
|
||||
),
|
||||
Err(e) => push_error(ParseLineError {
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
kind: e.into(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// The reason one value or line of the OSM tag file could not be parsed.
///
/// Stored as the `kind` of a `ParseLineError`. Each variant wraps the
/// underlying error via `#[from]`, which is what allows the `e.into()`
/// conversions in `parse_osm_tag_file`.
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
/// A wikipedia title value was rejected.
#[error("bad title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
/// A wikidata QID value was rejected.
#[error("bad QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
/// The TSV reader reported a non-I/O error for a record
/// (I/O errors abort parsing in `parse_osm_tag_file` instead).
#[error("bad TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
text: String,
|
||||
line: u64,
|
||||
kind: ParseErrorKind,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// write source chain to ensure they are logged
|
||||
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
source = e.source();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// return nothing b/c Display prints source chain
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
use std::{iter, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use serde::Deserialize;
|
||||
|
||||
use super::{Qid, Title};
|
||||
|
@ -35,6 +36,7 @@ impl Page {
|
|||
/// Title of the article
|
||||
pub fn title(&self) -> anyhow::Result<Title> {
|
||||
Title::from_title(&self.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad title {:?}", self.name))
|
||||
}
|
||||
|
||||
/// All titles that lead to the article, the main title followed by any redirects.
|
||||
|
@ -43,9 +45,10 @@ impl Page {
|
|||
}
|
||||
|
||||
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
|
||||
self.redirects
|
||||
.iter()
|
||||
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
|
||||
self.redirects.iter().map(|r| {
|
||||
Title::from_title(&r.name, &self.in_language.identifier)
|
||||
.with_context(|| format!("bad redirect {:?}", self.name))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
|
||||
use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
|
||||
|
||||
/// Wikidata QID/Q Number
|
||||
///
|
||||
|
@ -21,15 +21,13 @@ use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
|
|||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct Qid(u32);
|
||||
|
||||
pub type ParseQidError = ParseIntError;
|
||||
|
||||
impl FromStr for Qid {
|
||||
type Err = ParseQidError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.trim();
|
||||
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
|
||||
u32::from_str(s).map(Qid)
|
||||
u32::from_str(s).map(Qid).map_err(ParseQidError)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -49,3 +47,18 @@ impl Qid {
|
|||
path
|
||||
}
|
||||
}
|
||||
|
||||
/// Error returned when text cannot be parsed as a [`Qid`].
///
/// A thin wrapper around the integer-parsing error: it displays exactly like
/// the wrapped error and forwards `source()` to it.
#[derive(Debug, PartialEq, Eq)]
pub struct ParseQidError(ParseIntError);

impl Display for ParseQidError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(inner) = self;
        Display::fmt(inner, f)
    }
}

impl Error for ParseQidError {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let Self(inner) = self;
        Error::source(inner)
    }
}
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
use std::{fmt::Display, path::PathBuf};
|
||||
|
||||
use anyhow::{anyhow, bail};
|
||||
use std::{fmt::Display, path::PathBuf, string::FromUtf8Error};
|
||||
|
||||
use url::Url;
|
||||
|
||||
|
@ -49,17 +47,17 @@ impl Title {
|
|||
}
|
||||
|
||||
// https://en.wikipedia.org/wiki/Article_Title/More_Title
|
||||
pub fn from_url(url: &str) -> anyhow::Result<Self> {
|
||||
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
|
||||
let url = Url::parse(url.trim())?;
|
||||
|
||||
let (subdomain, host) = url
|
||||
.host_str()
|
||||
.ok_or_else(|| anyhow!("Expected host"))?
|
||||
.ok_or(ParseTitleError::NoHost)?
|
||||
.split_once('.')
|
||||
.ok_or_else(|| anyhow!("Expected subdomain"))?;
|
||||
.ok_or(ParseTitleError::NoSubdomain)?;
|
||||
let host = host.strip_prefix("m.").unwrap_or(host);
|
||||
if host != "wikipedia.org" {
|
||||
bail!("Expected wikipedia.org for domain")
|
||||
return Err(ParseTitleError::BadDomain);
|
||||
}
|
||||
let lang = subdomain;
|
||||
|
||||
|
@ -69,10 +67,10 @@ impl Title {
|
|||
.strip_prefix('/')
|
||||
.unwrap_or(path)
|
||||
.split_once('/')
|
||||
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
|
||||
.ok_or(ParseTitleError::ShortPath)?;
|
||||
|
||||
if root != "wiki" {
|
||||
bail!("Expected 'wiki' as root path, got: {:?}", root)
|
||||
return Err(ParseTitleError::BadPath);
|
||||
}
|
||||
let title = urlencoding::decode(title)?;
|
||||
|
||||
|
@ -80,11 +78,11 @@ impl Title {
|
|||
}
|
||||
|
||||
// en:Article Title
|
||||
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
|
||||
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
|
||||
let (lang, title) = tag
|
||||
.trim()
|
||||
.split_once(':')
|
||||
.ok_or_else(|| anyhow!("Expected ':'"))?;
|
||||
.ok_or(ParseTitleError::MissingColon)?;
|
||||
|
||||
let lang = lang.trim_start();
|
||||
let title = title.trim_start();
|
||||
|
@ -100,14 +98,14 @@ impl Title {
|
|||
Self::from_title(title, lang)
|
||||
}
|
||||
|
||||
pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
|
||||
pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> {
|
||||
let title = title.trim();
|
||||
let lang = lang.trim();
|
||||
if title.is_empty() {
|
||||
bail!("title cannot be empty or whitespace");
|
||||
return Err(ParseTitleError::NoTitle);
|
||||
}
|
||||
if lang.is_empty() {
|
||||
bail!("lang cannot be empty or whitespace");
|
||||
return Err(ParseTitleError::NoLang);
|
||||
}
|
||||
let name = Self::normalize_title(title);
|
||||
let lang = lang.to_owned();
|
||||
|
@ -124,3 +122,29 @@ impl Title {
|
|||
path
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
pub enum ParseTitleError {
|
||||
#[error("title cannot be empty or whitespace")]
|
||||
NoTitle,
|
||||
#[error("lang cannot be empty or whitespace")]
|
||||
NoLang,
|
||||
#[error("no ':' separating lang and title")]
|
||||
MissingColon,
|
||||
|
||||
// url-specific
|
||||
#[error("cannot parse url")]
|
||||
Url(#[from] url::ParseError),
|
||||
#[error("cannot decode url")]
|
||||
UrlDecode(#[from] FromUtf8Error),
|
||||
#[error("no host in url")]
|
||||
NoHost,
|
||||
#[error("no subdomain in url")]
|
||||
NoSubdomain,
|
||||
#[error("url base domain is wikipedia.org")]
|
||||
BadDomain,
|
||||
#[error("url base path is not /wiki/")]
|
||||
BadPath,
|
||||
#[error("path has less than 2 segments")]
|
||||
ShortPath,
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue