Move file parsing out of wm module
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
faf4b760b2
commit
29d90376f3
5 changed files with 228 additions and 222 deletions
|
@ -9,7 +9,8 @@ use anyhow::{anyhow, bail, Context};
|
|||
|
||||
use om_wikiparser::{
|
||||
html::{self, HtmlError},
|
||||
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
|
||||
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
|
||||
wm::{Page, Title},
|
||||
};
|
||||
|
||||
/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
|
||||
|
|
57
src/lib.rs
57
src/lib.rs
|
@ -1,6 +1,57 @@
|
|||
pub mod html;
|
||||
pub mod osm;
|
||||
pub mod wm;
|
||||
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
use anyhow::Context;
|
||||
|
||||
pub mod html;
|
||||
pub mod osm;
|
||||
mod tag_file;
|
||||
pub use tag_file::*;
|
||||
pub mod wm;
|
||||
|
||||
use wm::{Qid, Title};
|
||||
|
||||
/// Read from a file of urls on each line.
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Qid::from_str(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
Err(e) => {
|
||||
warn!("Could not parse QID: {:#}", e);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Read article titles from a file of urls on each line.
|
||||
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Title::from_url(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
Err(e) => {
|
||||
warn!("Could not parse wikipedia title: {:#}", e);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
|
|
@ -119,12 +119,7 @@ fn main() -> anyhow::Result<()> {
|
|||
let mut titles = HashSet::new();
|
||||
let mut errors = Vec::new();
|
||||
info!("Reading osm tag file");
|
||||
om_wikiparser::wm::parse_osm_tag_file(
|
||||
osm_tags,
|
||||
&mut qids,
|
||||
&mut titles,
|
||||
Some(&mut errors),
|
||||
)?;
|
||||
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
|
||||
info!("Found {} errors in tag file", errors.len());
|
||||
|
||||
let mut writer = csv::WriterBuilder::new()
|
||||
|
@ -134,7 +129,7 @@ fn main() -> anyhow::Result<()> {
|
|||
writer.write_record(["line", "object", "version", "key", "error", "value"])?;
|
||||
|
||||
for error in errors {
|
||||
use om_wikiparser::wm::ParseErrorKind::*;
|
||||
use om_wikiparser::ParseErrorKind::*;
|
||||
let key = match error.kind {
|
||||
Title(_) => "wikipedia",
|
||||
Qid(_) => "wikidata",
|
||||
|
|
170
src/tag_file.rs
Normal file
170
src/tag_file.rs
Normal file
|
@ -0,0 +1,170 @@
|
|||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, bail};
|
||||
|
||||
use crate::{
|
||||
osm,
|
||||
wm::{ParseQidError, ParseTitleError, Qid, Title},
|
||||
};
|
||||
|
||||
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
|
||||
pub fn parse_osm_tag_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
};
|
||||
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
let mut osm_id_col = None;
|
||||
let mut osm_otype_col = None;
|
||||
let mut osm_oname_col = None;
|
||||
let mut osm_version_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
match title {
|
||||
"wikidata" => qid_col = Some(column),
|
||||
"wikipedia" => title_col = Some(column),
|
||||
"@id" => osm_id_col = Some(column),
|
||||
"@otype" => osm_otype_col = Some(column),
|
||||
"@oname" => osm_oname_col = Some(column),
|
||||
"@version" => osm_version_col = Some(column),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
|
||||
let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
|
||||
|
||||
let mut row = csv::StringRecord::new();
|
||||
loop {
|
||||
match rdr.read_record(&mut row) {
|
||||
Ok(true) => {}
|
||||
// finished
|
||||
Ok(false) => break,
|
||||
// attempt to recover from parsing errors
|
||||
Err(e) => {
|
||||
if e.is_io_error() {
|
||||
bail!(e)
|
||||
}
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
osm_id: None,
|
||||
osm_type: None,
|
||||
osm_version: None,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let parse_metadata = || {
|
||||
(
|
||||
osm_id_col.and_then(|i| row[i].trim().parse::<osm::Id>().ok()),
|
||||
// Prefer otype, use oname if not available
|
||||
osm_otype_col
|
||||
.and_then(|i| row[i].trim().parse().ok())
|
||||
.and_then(osm::Kind::from_otype)
|
||||
.or_else(|| osm_oname_col.and_then(|i| osm::Kind::from_oname(&row[i]))),
|
||||
osm_version_col.and_then(|i| row[i].trim().parse::<osm::Version>().ok()),
|
||||
)
|
||||
};
|
||||
|
||||
let qid = &row[qid_col].trim();
|
||||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
osm_type,
|
||||
osm_version,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let title = &row[title_col].trim();
|
||||
if !title.is_empty() {
|
||||
match Title::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
osm_type,
|
||||
osm_version,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
#[error("title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
#[error("QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
#[error("TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
pub kind: ParseErrorKind,
|
||||
pub text: String,
|
||||
pub line: u64,
|
||||
pub osm_id: Option<osm::Id>,
|
||||
pub osm_type: Option<osm::Kind>,
|
||||
pub osm_version: Option<osm::Version>,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "on line {}", self.line)?;
|
||||
if let Some(osm_id) = self.osm_id {
|
||||
write!(f, " ({osm_id})")?;
|
||||
}
|
||||
write!(f, ": {} {:?}", self.kind, self.text)?;
|
||||
|
||||
// Write source error chain to ensure they are logged.
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
source = e.source();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// Return nothing because Display prints source chain.
|
||||
None
|
||||
}
|
||||
}
|
211
src/wm/mod.rs
211
src/wm/mod.rs
|
@ -1,218 +1,7 @@
|
|||
//! Wikimedia types
|
||||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
mod page;
|
||||
pub use page::Page;
|
||||
mod title;
|
||||
pub use title::*;
|
||||
mod qid;
|
||||
pub use qid::*;
|
||||
|
||||
use crate::osm;
|
||||
|
||||
/// Read from a file of urls on each line.
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Qid::from_str(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
Err(e) => {
|
||||
warn!("Could not parse QID: {:#}", e);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Read article titles from a file of urls on each line.
|
||||
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Title::from_url(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
Err(e) => {
|
||||
warn!("Could not parse wikipedia title: {:#}", e);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub fn parse_osm_tag_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
};
|
||||
|
||||
let mut qid_col = None;
|
||||
let mut title_col = None;
|
||||
let mut osm_id_col = None;
|
||||
let mut osm_otype_col = None;
|
||||
let mut osm_oname_col = None;
|
||||
let mut osm_version_col = None;
|
||||
for (column, title) in rdr.headers()?.iter().enumerate() {
|
||||
match title {
|
||||
"wikidata" => qid_col = Some(column),
|
||||
"wikipedia" => title_col = Some(column),
|
||||
"@id" => osm_id_col = Some(column),
|
||||
"@otype" => osm_otype_col = Some(column),
|
||||
"@oname" => osm_oname_col = Some(column),
|
||||
"@version" => osm_version_col = Some(column),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
|
||||
let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
|
||||
|
||||
let mut row = csv::StringRecord::new();
|
||||
loop {
|
||||
match rdr.read_record(&mut row) {
|
||||
Ok(true) => {}
|
||||
// finished
|
||||
Ok(false) => break,
|
||||
// attempt to recover from parsing errors
|
||||
Err(e) => {
|
||||
if e.is_io_error() {
|
||||
bail!(e)
|
||||
}
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: String::new(),
|
||||
line: rdr.position().line(),
|
||||
osm_id: None,
|
||||
osm_type: None,
|
||||
osm_version: None,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let parse_metadata = || {
|
||||
(
|
||||
osm_id_col.and_then(|i| row[i].trim().parse::<osm::Id>().ok()),
|
||||
// Prefer otype, use oname if not available
|
||||
osm_otype_col
|
||||
.and_then(|i| row[i].trim().parse().ok())
|
||||
.and_then(osm::Kind::from_otype)
|
||||
.or_else(|| osm_oname_col.and_then(|i| osm::Kind::from_oname(&row[i]))),
|
||||
osm_version_col.and_then(|i| row[i].trim().parse::<osm::Version>().ok()),
|
||||
)
|
||||
};
|
||||
|
||||
let qid = &row[qid_col].trim();
|
||||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: qid.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
osm_type,
|
||||
osm_version,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let title = &row[title_col].trim();
|
||||
if !title.is_empty() {
|
||||
match Title::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
push_error(ParseLineError {
|
||||
kind: e.into(),
|
||||
text: title.to_string(),
|
||||
line: rdr.position().line(),
|
||||
osm_id,
|
||||
osm_type,
|
||||
osm_version,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ParseErrorKind {
|
||||
#[error("title")]
|
||||
Title(#[from] ParseTitleError),
|
||||
#[error("QID")]
|
||||
Qid(#[from] ParseQidError),
|
||||
#[error("TSV line")]
|
||||
Tsv(#[from] csv::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseLineError {
|
||||
pub kind: ParseErrorKind,
|
||||
pub text: String,
|
||||
pub line: u64,
|
||||
pub osm_id: Option<osm::Id>,
|
||||
pub osm_type: Option<osm::Kind>,
|
||||
pub osm_version: Option<osm::Version>,
|
||||
}
|
||||
|
||||
impl Display for ParseLineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "on line {}", self.line)?;
|
||||
if let Some(osm_id) = self.osm_id {
|
||||
write!(f, " ({osm_id})")?;
|
||||
}
|
||||
write!(f, ": {} {:?}", self.kind, self.text)?;
|
||||
|
||||
// Write source error chain to ensure they are logged.
|
||||
let mut source = self.kind.source();
|
||||
while let Some(e) = source {
|
||||
write!(f, ": {}", e)?;
|
||||
source = e.source();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for ParseLineError {
|
||||
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
||||
// Return nothing because Display prints source chain.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue