Move file parsing out of wm module

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-25 15:24:39 -04:00 committed by Evan Lloyd New-Schmidt
parent faf4b760b2
commit 29d90376f3
5 changed files with 228 additions and 222 deletions

View file

@ -9,7 +9,8 @@ use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
html::{self, HtmlError},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
};
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.

View file

@ -1,6 +1,57 @@
pub mod html;
pub mod osm;
pub mod wm;
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
#[macro_use]
extern crate log;
use anyhow::Context;
pub mod html;
pub mod osm;
mod tag_file;
pub use tag_file::*;
pub mod wm;
use wm::{Qid, Title};
/// Collect the QIDs found in the file at `path`, one entry per line.
///
/// Every line is handed to [`Qid::from_str`]. A line that fails to parse is
/// reported with a warning (including its 1-based line number and the raw
/// text) and skipped; it does not abort the whole read.
///
/// # Errors
///
/// Fails only if the file cannot be read into a string.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
    let text = fs::read_to_string(path.as_ref())?;

    let mut qids = HashSet::new();
    for (index, line) in text.lines().enumerate() {
        // Attach the location to the parse error before it is logged.
        let parsed = Qid::from_str(line).with_context(|| {
            let line_num = index + 1;
            format!("on line {line_num}: {line:?}")
        });

        match parsed {
            Ok(qid) => {
                qids.insert(qid);
            }
            Err(err) => warn!("Could not parse QID: {:#}", err),
        }
    }

    Ok(qids)
}
/// Collect the article titles found in the file at `path`, one url per line.
///
/// Every line is handed to `Title::from_url`. A line that fails to parse is
/// reported with a warning (including its 1-based line number and the raw
/// text) and skipped; it does not abort the whole read.
///
/// # Errors
///
/// Fails only if the file cannot be read into a string.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
    let text = fs::read_to_string(path.as_ref())?;

    let mut titles = HashSet::new();
    for (index, line) in text.lines().enumerate() {
        // Attach the location to the parse error before it is logged.
        let parsed = Title::from_url(line).with_context(|| {
            let line_num = index + 1;
            format!("on line {line_num}: {line:?}")
        });

        match parsed {
            Ok(title) => {
                titles.insert(title);
            }
            Err(err) => warn!("Could not parse wikipedia title: {:#}", err),
        }
    }

    Ok(titles)
}

View file

@ -119,12 +119,7 @@ fn main() -> anyhow::Result<()> {
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::wm::parse_osm_tag_file(
osm_tags,
&mut qids,
&mut titles,
Some(&mut errors),
)?;
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()
@ -134,7 +129,7 @@ fn main() -> anyhow::Result<()> {
writer.write_record(["line", "object", "version", "key", "error", "value"])?;
for error in errors {
use om_wikiparser::wm::ParseErrorKind::*;
use om_wikiparser::ParseErrorKind::*;
let key = match error.kind {
Title(_) => "wikipedia",
Qid(_) => "wikidata",

170
src/tag_file.rs Normal file
View file

@ -0,0 +1,170 @@
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
use anyhow::{anyhow, bail};
use crate::{
osm,
wm::{ParseQidError, ParseTitleError, Qid, Title},
};
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
///
/// The header row must contain a `wikidata` and a `wikipedia` column. The
/// `@id`, `@otype`, `@oname` and `@version` columns are optional and are only
/// consulted to annotate reported errors.
///
/// Non-empty values that parse successfully are inserted into `qids` and
/// `titles`. Values that fail to parse, and malformed TSV records, are logged
/// at debug level and, when `line_errors` is `Some`, appended to it; reading
/// then continues with the next value or record.
///
/// Reported line numbers refer to the start of the offending record.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, the header cannot be read,
/// a required column is missing, or an I/O error occurs while reading records.
pub fn parse_osm_tag_file(
    path: impl AsRef<OsStr>,
    qids: &mut HashSet<Qid>,
    titles: &mut HashSet<Title>,
    mut line_errors: Option<&mut Vec<ParseLineError>>,
) -> anyhow::Result<()> {
    let path = path.as_ref();
    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;

    // Log every per-line problem and collect it if the caller asked for that.
    let mut push_error = |e: ParseLineError| {
        debug!("Tag parse error: {e}");
        if let Some(ref mut errs) = line_errors {
            errs.push(e);
        }
    };

    let mut qid_col = None;
    let mut title_col = None;
    let mut osm_id_col = None;
    let mut osm_otype_col = None;
    let mut osm_oname_col = None;
    let mut osm_version_col = None;
    for (column, title) in rdr.headers()?.iter().enumerate() {
        match title {
            "wikidata" => qid_col = Some(column),
            "wikipedia" => title_col = Some(column),
            "@id" => osm_id_col = Some(column),
            "@otype" => osm_otype_col = Some(column),
            "@oname" => osm_oname_col = Some(column),
            "@version" => osm_version_col = Some(column),
            _ => (),
        }
    }

    let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
    let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;

    let mut row = csv::StringRecord::new();
    loop {
        match rdr.read_record(&mut row) {
            Ok(true) => {}
            // finished
            Ok(false) => break,
            // attempt to recover from parsing errors
            Err(e) => {
                if e.is_io_error() {
                    bail!(e)
                }
                // The reader has already moved on, so prefer the position the
                // error itself carries; fall back to the reader otherwise.
                let line = e
                    .position()
                    .map_or_else(|| rdr.position().line(), |pos| pos.line());
                push_error(ParseLineError {
                    kind: e.into(),
                    text: String::new(),
                    line,
                    osm_id: None,
                    osm_type: None,
                    osm_version: None,
                });
                continue;
            }
        }

        // `rdr.position()` now points at the *next* record. The record keeps
        // the position where it started, which is the line to report.
        let line = row
            .position()
            .map_or_else(|| rdr.position().line(), |pos| pos.line());

        // Only evaluated when an error is reported for this row.
        let parse_metadata = || {
            (
                osm_id_col.and_then(|i| row[i].trim().parse::<osm::Id>().ok()),
                // Prefer otype, use oname if not available
                osm_otype_col
                    .and_then(|i| row[i].trim().parse().ok())
                    .and_then(osm::Kind::from_otype)
                    .or_else(|| osm_oname_col.and_then(|i| osm::Kind::from_oname(&row[i]))),
                osm_version_col.and_then(|i| row[i].trim().parse::<osm::Version>().ok()),
            )
        };

        let qid = row[qid_col].trim();
        if !qid.is_empty() {
            match Qid::from_str(qid) {
                Ok(qid) => {
                    qids.insert(qid);
                }
                Err(e) => {
                    let (osm_id, osm_type, osm_version) = parse_metadata();
                    push_error(ParseLineError {
                        kind: e.into(),
                        text: qid.to_string(),
                        line,
                        osm_id,
                        osm_type,
                        osm_version,
                    })
                }
            }
        }

        let title = row[title_col].trim();
        if !title.is_empty() {
            match Title::from_osm_tag(title) {
                Ok(title) => {
                    titles.insert(title);
                }
                Err(e) => {
                    let (osm_id, osm_type, osm_version) = parse_metadata();
                    push_error(ParseLineError {
                        kind: e.into(),
                        text: title.to_string(),
                        line,
                        osm_id,
                        osm_type,
                        osm_version,
                    })
                }
            }
        }
    }

    Ok(())
}
/// The category of a problem found while reading the OSM tag file.
///
/// Each variant wraps the underlying error as its source; the `Display` text
/// is only the short label given in the `#[error]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum ParseErrorKind {
/// A `wikipedia` value could not be parsed as a title.
#[error("title")]
Title(#[from] ParseTitleError),
/// A `wikidata` value could not be parsed as a QID.
#[error("QID")]
Qid(#[from] ParseQidError),
/// The CSV reader reported an error for a record.
#[error("TSV line")]
Tsv(#[from] csv::Error),
}
/// A recoverable error tied to one record of the OSM tag file.
#[derive(Debug)]
pub struct ParseLineError {
/// What went wrong, including the underlying error.
pub kind: ParseErrorKind,
/// The trimmed value that failed to parse; empty for TSV-level errors.
pub text: String,
/// Line number in the tag file, as reported by the CSV reader.
pub line: u64,
/// OSM object id from the `@id` column, if present and parseable.
pub osm_id: Option<osm::Id>,
/// OSM object kind from `@otype`, or `@oname` as a fallback, if available.
pub osm_type: Option<osm::Kind>,
/// OSM object version from the `@version` column, if present and parseable.
pub osm_version: Option<osm::Version>,
}
impl Display for ParseLineError {
    /// Formats as `on line N[ (osm_id)]: <kind> "<text>"`, followed by every
    /// error in the kind's source chain, each prefixed with `": "`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "on line {}", self.line)?;
        match self.osm_id {
            Some(osm_id) => write!(f, " ({osm_id})")?,
            None => {}
        }
        write!(f, ": {} {:?}", self.kind, self.text)?;

        // The chain is flattened here because `source()` on this type
        // returns nothing; otherwise the causes would never be logged.
        let mut cause = self.kind.source();
        loop {
            match cause {
                Some(err) => {
                    write!(f, ": {}", err)?;
                    cause = err.source();
                }
                None => return Ok(()),
            }
        }
    }
}
impl Error for ParseLineError {
/// Always returns `None`.
///
/// The `Display` implementation already writes out the source chain of
/// `kind`; exposing it here as well would make error reporters that walk
/// `source()` print the same causes twice.
fn source(&self) -> Option<&(dyn Error + 'static)> {
// Return nothing because Display prints source chain.
None
}
}

View file

@ -1,218 +1,7 @@
//! Wikimedia types
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
use anyhow::{anyhow, bail, Context};
mod page;
pub use page::Page;
mod title;
pub use title::*;
mod qid;
pub use qid::*;
use crate::osm;
/// Read QIDs from a file of urls, one per line.
///
/// Lines that fail to parse are logged as warnings and skipped.
///
/// # Errors
///
/// Fails only if the file cannot be read into a string.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
// Attach the 1-based line number and raw text to any parse error.
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
// Keep successes; log and drop failures.
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
}
})
.collect())
}
/// Read article titles from a file of urls, one per line.
///
/// Lines that fail to parse are logged as warnings and skipped.
///
/// # Errors
///
/// Fails only if the file cannot be read into a string.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
// Attach the 1-based line number and raw text to any parse error.
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
// Keep successes; log and drop failures. (`qid` here holds a title.)
.filter_map(|r| match r {
Ok(qid) => Some(qid),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
}
})
.collect())
}
/// Read a tab-separated file of OSM tags and collect its wikidata/wikipedia values.
///
/// The header must contain `wikidata` and `wikipedia` columns; `@id`,
/// `@otype`, `@oname` and `@version` are optional and only used to annotate
/// errors. Parsed values are inserted into `qids` and `titles`. Unparseable
/// values and malformed records are logged at debug level and, when
/// `line_errors` is `Some`, appended to it.
///
/// # Errors
///
/// Fails if the file cannot be opened, the header cannot be read, a required
/// column is missing, or an I/O error occurs while reading records.
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
// Log every per-line problem and collect it if the caller asked for that.
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
};
// Column indices discovered from the header row.
let mut qid_col = None;
let mut title_col = None;
let mut osm_id_col = None;
let mut osm_otype_col = None;
let mut osm_oname_col = None;
let mut osm_version_col = None;
for (column, title) in rdr.headers()?.iter().enumerate() {
match title {
"wikidata" => qid_col = Some(column),
"wikipedia" => title_col = Some(column),
"@id" => osm_id_col = Some(column),
"@otype" => osm_otype_col = Some(column),
"@oname" => osm_oname_col = Some(column),
"@version" => osm_version_col = Some(column),
_ => (),
}
}
// These two columns are mandatory.
let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
// One record buffer is reused for every row.
let mut row = csv::StringRecord::new();
loop {
match rdr.read_record(&mut row) {
Ok(true) => {}
// finished
Ok(false) => break,
// attempt to recover from parsing errors
Err(e) => {
if e.is_io_error() {
bail!(e)
}
// NOTE(review): `rdr.position()` is read after `read_record` returned;
// confirm it is the failing record's line and not the next record's.
push_error(ParseLineError {
kind: e.into(),
text: String::new(),
line: rdr.position().line(),
osm_id: None,
osm_type: None,
osm_version: None,
});
continue;
}
}
// Only evaluated when an error is reported for this row.
let parse_metadata = || {
(
osm_id_col.and_then(|i| row[i].trim().parse::<osm::Id>().ok()),
// Prefer otype, use oname if not available
osm_otype_col
.and_then(|i| row[i].trim().parse().ok())
.and_then(osm::Kind::from_otype)
.or_else(|| osm_oname_col.and_then(|i| osm::Kind::from_oname(&row[i]))),
osm_version_col.and_then(|i| row[i].trim().parse::<osm::Version>().ok()),
)
};
// Empty cells are skipped without reporting an error.
let qid = &row[qid_col].trim();
if !qid.is_empty() {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
push_error(ParseLineError {
kind: e.into(),
text: qid.to_string(),
line: rdr.position().line(),
osm_id,
osm_type,
osm_version,
})
}
}
}
// Empty cells are skipped without reporting an error.
let title = &row[title_col].trim();
if !title.is_empty() {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
push_error(ParseLineError {
kind: e.into(),
text: title.to_string(),
line: rdr.position().line(),
osm_id,
osm_type,
osm_version,
})
}
}
}
}
Ok(())
}
/// The category of a problem found while reading the OSM tag file.
///
/// Each variant wraps the underlying error as its source; the `Display` text
/// is only the short label given in the `#[error]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum ParseErrorKind {
/// A `wikipedia` value could not be parsed as a title.
#[error("title")]
Title(#[from] ParseTitleError),
/// A `wikidata` value could not be parsed as a QID.
#[error("QID")]
Qid(#[from] ParseQidError),
/// The CSV reader reported an error for a record.
#[error("TSV line")]
Tsv(#[from] csv::Error),
}
/// A recoverable error tied to one record of the OSM tag file.
#[derive(Debug)]
pub struct ParseLineError {
/// What went wrong, including the underlying error.
pub kind: ParseErrorKind,
/// The trimmed value that failed to parse; empty for TSV-level errors.
pub text: String,
/// Line number in the tag file, as reported by the CSV reader.
pub line: u64,
/// OSM object id from the `@id` column, if present and parseable.
pub osm_id: Option<osm::Id>,
/// OSM object kind from `@otype`, or `@oname` as a fallback, if available.
pub osm_type: Option<osm::Kind>,
/// OSM object version from the `@version` column, if present and parseable.
pub osm_version: Option<osm::Version>,
}
impl Display for ParseLineError {
/// Formats as `on line N[ (osm_id)]: <kind> "<text>"`, followed by every
/// error in the kind's source chain, each prefixed with `": "`.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "on line {}", self.line)?;
// The OSM id is only shown when it was available for this record.
if let Some(osm_id) = self.osm_id {
write!(f, " ({osm_id})")?;
}
write!(f, ": {} {:?}", self.kind, self.text)?;
// Write source error chain to ensure they are logged.
let mut source = self.kind.source();
while let Some(e) = source {
write!(f, ": {}", e)?;
source = e.source();
}
Ok(())
}
}
impl Error for ParseLineError {
/// Always returns `None`.
///
/// The `Display` implementation already writes out the source chain of
/// `kind`; exposing it here as well would make error reporters that walk
/// `source()` print the same causes twice.
fn source(&self) -> Option<&(dyn Error + 'static)> {
// Return nothing because Display prints source chain.
None
}
}