Make parse functions collection-agnostic

This makes it possible to use a Set in one place and a Vec in another,
to log or count items without allocating a collection for all of them,
and to ignore errors with no overhead.

The alternative is converting them to custom iterators, which is more work
than I want to do right now.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2024-03-15 13:30:00 -04:00 committed by Evan Lloyd New-Schmidt
parent d723452ec5
commit e61f12d014
5 changed files with 97 additions and 65 deletions

38
src/extend.rs Normal file
View file

@@ -0,0 +1,38 @@
//! Utilities for working with [Extend].
use std::iter::Extend;
/// Builds an [Extend] adapter that invokes `f` once per extended `Item`.
///
/// ```
/// # use om_wikiparser::extend;
/// let mut count = 0;
///
/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
///     [1, 2, 3, 4],
///     ['a', 'b', 'c']));
/// assert_eq!(count, 3);
/// ```
pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
    FromFn(f)
}

/// Adapter returned by [from_fn]; wraps the per-item callback.
pub struct FromFn<F>(F);

impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // Hand every item straight to the wrapped callback; `&mut F: FnMut`
        // lets `for_each` drive it without moving it out of `self`.
        iter.into_iter().for_each(&mut self.0);
    }
}
/// Builds an [Extend] adapter that consumes and discards every `Item`.
pub fn sink() -> Sink {
    Sink(())
}

/// Adapter returned by [sink]; accepts items of any type and drops them.
pub struct Sink(());

impl<Item> Extend<Item> for Sink {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // Drain the iterator fully (so any of its side effects still run),
        // dropping each item.
        iter.into_iter().for_each(drop);
    }
}

View file

@@ -1,7 +1,8 @@
use std::{
borrow::Cow,
collections::HashSet,
fs::{self, File},
io::{stdin, stdout, BufRead, Write},
io::{stdin, stdout, BufRead, BufReader, Write},
os::unix,
path::{Path, PathBuf},
};
@@ -9,6 +10,7 @@ use std::{
use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
extend,
html::{self, HtmlError},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
@@ -67,34 +69,34 @@ pub struct Args {
}
pub fn run(args: Args) -> anyhow::Result<()> {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = HashSet::new();
if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
parse_wikipedia_file(path)?
} else {
Default::default()
};
let file = BufReader::new(File::open(path)?);
parse_wikipedia_file(file, &mut wikipedia_titles)?
}
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
let mut wikidata_qids = HashSet::new();
if let Some(path) = args.wikidata_qids {
info!("Loading wikidata QIDs from {path:?}");
parse_wikidata_file(path)?
} else {
Default::default()
let file = BufReader::new(File::open(path)?);
parse_wikidata_file(file, &mut wikidata_qids)?
};
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
let file = File::open(path)?;
let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
let mut error_count = 0;
parse_osm_tag_file(
path,
file,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
&mut extend::from_fn(|_| error_count += 1),
)?;
if !line_errors.is_empty() {
let error_count = line_errors.len();
if error_count != 0 {
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let percentage = 100.0 * error_count as f64 / new_items as f64;
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);

View file

@@ -1,57 +1,52 @@
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{
io::{self, BufRead},
str::FromStr,
};
#[macro_use]
extern crate log;
use anyhow::Context;
pub mod html;
pub mod osm;
mod tag_file;
pub use tag_file::*;
pub mod extend;
pub mod wm;
use wm::{Qid, Title};
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Qid::from_str(&line) {
Ok(qid) => collection.extend(Some(qid)),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
let line_num = i + 1;
warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
}
})
.collect())
}
}
Ok(())
}
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_osm_tag(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikipedia_file(
r: impl BufRead,
collection: &mut impl Extend<Title>,
) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Title::from_osm_tag(&line) {
Ok(title) => collection.extend(Some(title)),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
let line_num = i + 1;
warn!(
"Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
e
);
}
})
.collect())
}
}
Ok(())
}

View file

@@ -120,7 +120,8 @@ fn main() -> anyhow::Result<()> {
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
let file = File::open(osm_tags)?;
om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()

View file

@@ -1,4 +1,4 @@
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
use std::{error::Error, fmt::Display, io::Read, str::FromStr};
use anyhow::{anyhow, bail};
@@ -9,19 +9,15 @@ use crate::{
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
r: impl Read,
qids: &mut impl Extend<Qid>,
titles: &mut impl Extend<Title>,
line_errors: &mut impl Extend<ParseLineError>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
line_errors.extend(Some(e));
};
let mut qid_col = None;
@@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
if !qid.is_empty() {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
qids.extend(Some(qid));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@ fn parse_osm_tag_file(
if !title.is_empty() {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
titles.extend(Some(title));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();