diff --git a/Cargo.lock b/Cargo.lock index 7b83221..025b3da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -692,6 +692,7 @@ dependencies = [ "scraper", "serde", "serde_json", + "thiserror", "url", "urlencoding", ] diff --git a/Cargo.toml b/Cargo.toml index 86f2f86..462d6c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ rayon = "1.7.0" scraper = "0.16.0" serde = { version = "1.0.163", features = ["derive"] } serde_json = "1.0.96" +thiserror = "1.0.44" url = "2.3.1" urlencoding = "2.1.2" diff --git a/src/get_articles.rs b/src/get_articles.rs index b0cbdab..54a3b68 100644 --- a/src/get_articles.rs +++ b/src/get_articles.rs @@ -60,9 +60,34 @@ pub fn run(args: Args) -> anyhow::Result<()> { Default::default() }; - if let Some(path) = args.osm_tags { + if let Some(ref path) = args.osm_tags { info!("Loading wikipedia/wikidata osm tags from {path:?}"); - parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?; + + let original_items = wikidata_qids.len() + wikipedia_titles.len(); + let mut line_errors = Vec::new(); + parse_osm_tag_file( + path, + &mut wikidata_qids, + &mut wikipedia_titles, + Some(&mut line_errors), + )?; + + if !line_errors.is_empty() { + let error_count = line_errors.len(); + let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items; + let expected_threshold = 0.02; + let percentage = 100.0 * error_count as f64 / new_items as f64; + let level = if percentage >= expected_threshold { + log::Level::Error + } else { + log::Level::Info + }; + + log!( + level, + "{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}", + ); + } } debug!("Parsed {} unique article titles", wikipedia_titles.len()); diff --git a/src/wm/mod.rs b/src/wm/mod.rs index a78167d..530b41a 100644 --- a/src/wm/mod.rs +++ b/src/wm/mod.rs @@ -1,7 +1,7 @@ //! 
Wikimedia types -use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr}; +use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr}; -use anyhow::{anyhow, Context}; +use anyhow::{anyhow, bail, Context}; mod page; pub use page::Page; @@ -58,10 +58,18 @@ pub fn parse_osm_tag_file( path: impl AsRef<OsStr>, qids: &mut HashSet<Qid>, titles: &mut HashSet<Title>, + mut line_errors: Option<&mut Vec<ParseLineError>>, ) -> anyhow::Result<()> { let path = path.as_ref(); let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?; + let mut push_error = |e: ParseLineError| { + debug!("Tag parse error: {e}"); + if let Some(ref mut errs) = line_errors { + errs.push(e); + } + }; + let mut qid_col = None; let mut title_col = None; for (column, title) in rdr.headers()?.iter().enumerate() { @@ -83,7 +91,14 @@ Ok(false) => break, // attempt to recover from parsing errors Err(e) => { - error!("Error parsing tsv file: {}", e); + if e.is_io_error() { + bail!(e) + } + push_error(ParseLineError { + text: String::new(), + line: rdr.position().line(), + kind: e.into(), + }); continue; } } @@ -94,13 +109,11 @@ Ok(qid) => { qids.insert(qid); } - Err(e) => warn!( - "Cannot parse qid {:?} on line {} in {:?}: {}", - qid, - rdr.position().line(), - path, - e - ), + Err(e) => push_error(ParseLineError { + text: qid.to_string(), + line: rdr.position().line(), + kind: e.into(), + }), } } @@ -110,16 +123,51 @@ Ok(title) => { titles.insert(title); } - Err(e) => warn!( - "Cannot parse title {:?} on line {} in {:?}: {}", - title, - rdr.position().line(), - path, - e - ), + Err(e) => push_error(ParseLineError { + text: title.to_string(), + line: rdr.position().line(), + kind: e.into(), + }), } } } Ok(()) } + +#[derive(Debug, thiserror::Error)] +pub enum ParseErrorKind { + #[error("bad title")] + Title(#[from] ParseTitleError), + #[error("bad QID")] + Qid(#[from] ParseQidError), + 
#[error("bad TSV line")] + Tsv(#[from] csv::Error), +} + +#[derive(Debug)] +pub struct ParseLineError { + text: String, + line: u64, + kind: ParseErrorKind, +} + +impl Display for ParseLineError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // write source chain to ensure they are logged + write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?; + let mut source = self.kind.source(); + while let Some(e) = source { + write!(f, ": {}", e)?; + source = e.source(); + } + Ok(()) + } +} + +impl Error for ParseLineError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + // return nothing b/c Display prints source chain + None + } +} diff --git a/src/wm/page.rs b/src/wm/page.rs index c77cf87..239f1cd 100644 --- a/src/wm/page.rs +++ b/src/wm/page.rs @@ -1,5 +1,6 @@ use std::{iter, str::FromStr}; +use anyhow::Context; use serde::Deserialize; use super::{Qid, Title}; @@ -35,6 +36,7 @@ impl Page { /// Title of the article pub fn title(&self) -> anyhow::Result<Title> { Title::from_title(&self.name, &self.in_language.identifier) + .with_context(|| format!("bad title {:?}", self.name)) } /// All titles that lead to the article, the main title followed by any redirects. 
@@ -43,9 +45,10 @@ impl Page { } pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ { - self.redirects - .iter() - .map(|r| Title::from_title(&r.name, &self.in_language.identifier)) + self.redirects.iter().map(|r| { + Title::from_title(&r.name, &self.in_language.identifier) + .with_context(|| format!("bad redirect {:?}", self.name)) + }) } } diff --git a/src/wm/qid.rs b/src/wm/qid.rs index 29fc7d3..f908ae4 100644 --- a/src/wm/qid.rs +++ b/src/wm/qid.rs @@ -1,4 +1,4 @@ -use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr}; +use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr}; /// Wikidata QID/Q Number /// @@ -21,15 +21,13 @@ use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr}; #[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct Qid(u32); -pub type ParseQidError = ParseIntError; - impl FromStr for Qid { type Err = ParseQidError; fn from_str(s: &str) -> Result<Self, Self::Err> { let s = s.trim(); let s = s.strip_prefix(['Q', 'q']).unwrap_or(s); - u32::from_str(s).map(Qid) + u32::from_str(s).map(Qid).map_err(ParseQidError) } } @@ -49,3 +47,18 @@ impl Qid { path } } + +#[derive(Debug, PartialEq, Eq)] +pub struct ParseQidError(ParseIntError); + +impl Display for ParseQidError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl Error for ParseQidError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + self.0.source() + } +} diff --git a/src/wm/title.rs b/src/wm/title.rs index e06dee0..5d0879a 100644 --- a/src/wm/title.rs +++ b/src/wm/title.rs @@ -1,6 +1,4 @@ -use std::{fmt::Display, path::PathBuf}; - -use anyhow::{anyhow, bail}; +use std::{fmt::Display, path::PathBuf, string::FromUtf8Error}; use url::Url; @@ -49,17 +47,17 @@ impl Title { } // https://en.wikipedia.org/wiki/Article_Title/More_Title - pub fn from_url(url: &str) -> anyhow::Result<Self> { + pub fn from_url(url: &str) -> Result<Self, 
ParseTitleError> { let url = Url::parse(url.trim())?; let (subdomain, host) = url .host_str() - .ok_or_else(|| anyhow!("Expected host"))? + .ok_or(ParseTitleError::NoHost)? .split_once('.') - .ok_or_else(|| anyhow!("Expected subdomain"))?; + .ok_or(ParseTitleError::NoSubdomain)?; let host = host.strip_prefix("m.").unwrap_or(host); if host != "wikipedia.org" { - bail!("Expected wikipedia.org for domain") + return Err(ParseTitleError::BadDomain); } let lang = subdomain; @@ -69,10 +67,10 @@ impl Title { .strip_prefix('/') .unwrap_or(path) .split_once('/') - .ok_or_else(|| anyhow!("Expected at least two segments in path"))?; + .ok_or(ParseTitleError::ShortPath)?; if root != "wiki" { - bail!("Expected 'wiki' as root path, got: {:?}", root) + return Err(ParseTitleError::BadPath); } let title = urlencoding::decode(title)?; @@ -80,11 +78,11 @@ impl Title { } // en:Article Title - pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> { + pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> { let (lang, title) = tag .trim() .split_once(':') - .ok_or_else(|| anyhow!("Expected ':'"))?; + .ok_or(ParseTitleError::MissingColon)?; let lang = lang.trim_start(); let title = title.trim_start(); @@ -100,14 +98,14 @@ impl Title { Self::from_title(title, lang) } - pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> { + pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> { let title = title.trim(); let lang = lang.trim(); if title.is_empty() { - bail!("title cannot be empty or whitespace"); + return Err(ParseTitleError::NoTitle); } if lang.is_empty() { - bail!("lang cannot be empty or whitespace"); + return Err(ParseTitleError::NoLang); } let name = Self::normalize_title(title); let lang = lang.to_owned(); @@ -124,3 +122,29 @@ impl Title { path } } + +#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub enum ParseTitleError { + #[error("title cannot be empty or whitespace")] + NoTitle, + #[error("lang cannot be empty or whitespace")] + 
NoLang, + #[error("no ':' separating lang and title")] + MissingColon, + + // url-specific + #[error("cannot parse url")] + Url(#[from] url::ParseError), + #[error("cannot decode url")] + UrlDecode(#[from] FromUtf8Error), + #[error("no host in url")] + NoHost, + #[error("no subdomain in url")] + NoSubdomain, + #[error("url base domain is not wikipedia.org")] + BadDomain, + #[error("url base path is not /wiki/")] + BadPath, + #[error("path has less than 2 segments")] + ShortPath, +}