diff --git a/src/extend.rs b/src/extend.rs
new file mode 100644
index 0000000..523c9bb
--- /dev/null
+++ b/src/extend.rs
@@ -0,0 +1,38 @@
+//! Utilities for working with [Extend].
+use std::iter::Extend;
+
+/// Calls `f` for each `Item`.
+///
+/// ```
+/// # use om_wikiparser::extend;
+/// let mut count = 0;
+///
+/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
+///     [1, 2, 3, 4],
+///     ['a', 'b', 'c']));
+/// assert_eq!(count, 3);
+/// ```
+pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
+    FromFn(f)
+}
+
+pub struct FromFn<F>(F);
+impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
+    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
+        for item in iter {
+            self.0(item);
+        }
+    }
+}
+
+/// Iterates but drops each `Item`.
+pub fn sink() -> Sink {
+    Sink(())
+}
+
+pub struct Sink(());
+impl<Item> Extend<Item> for Sink {
+    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
+        for _item in iter {}
+    }
+}
diff --git a/src/get_articles.rs b/src/get_articles.rs
index a36a1dc..4add8c9 100644
--- a/src/get_articles.rs
+++ b/src/get_articles.rs
@@ -1,7 +1,8 @@
 use std::{
     borrow::Cow,
+    collections::HashSet,
     fs::{self, File},
-    io::{stdin, stdout, BufRead, Write},
+    io::{stdin, stdout, BufRead, BufReader, Write},
     os::unix,
     path::{Path, PathBuf},
 };
@@ -9,6 +10,7 @@ use std::{
 use anyhow::{anyhow, bail, Context};
 
 use om_wikiparser::{
+    extend,
     html::{self, HtmlError},
     parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
     wm::{Page, Title},
@@ -67,34 +69,34 @@ pub struct Args {
 }
 
 pub fn run(args: Args) -> anyhow::Result<()> {
-    let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+    let mut wikipedia_titles = HashSet::new();
+    if let Some(path) = args.wikipedia_urls {
         info!("Loading article urls from {path:?}");
-        parse_wikipedia_file(path)?
-    } else {
-        Default::default()
-    };
+        let file = BufReader::new(File::open(path)?);
+        parse_wikipedia_file(file, &mut wikipedia_titles)?
+    }
 
-    let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
+    let mut wikidata_qids = HashSet::new();
+    if let Some(path) = args.wikidata_qids {
         info!("Loading wikidata QIDs from {path:?}");
-        parse_wikidata_file(path)?
-    } else {
-        Default::default()
+        let file = BufReader::new(File::open(path)?);
+        parse_wikidata_file(file, &mut wikidata_qids)?
     };
 
     if let Some(ref path) = args.osm_tags {
         info!("Loading wikipedia/wikidata osm tags from {path:?}");
+        let file = File::open(path)?;
         let original_items = wikidata_qids.len() + wikipedia_titles.len();
 
-        let mut line_errors = Vec::new();
+        let mut error_count = 0;
         parse_osm_tag_file(
-            path,
+            file,
             &mut wikidata_qids,
             &mut wikipedia_titles,
-            Some(&mut line_errors),
+            &mut extend::from_fn(|_| error_count += 1),
         )?;
 
-        if !line_errors.is_empty() {
-            let error_count = line_errors.len();
+        if error_count != 0 {
             let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
             let percentage = 100.0 * error_count as f64 / new_items as f64;
             warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
diff --git a/src/lib.rs b/src/lib.rs
index 41d07ee..06e0f91 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,57 +1,52 @@
-use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
+use std::{
+    io::{self, BufRead},
+    str::FromStr,
+};
 
 #[macro_use]
 extern crate log;
 
-use anyhow::Context;
-
 pub mod html;
 pub mod osm;
 mod tag_file;
 pub use tag_file::*;
+pub mod extend;
 
 pub mod wm;
 use wm::{Qid, Title};
 
 /// Read from a file of urls on each line.
-pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
-    let contents = fs::read_to_string(path.as_ref())?;
-    Ok(contents
-        .lines()
-        .enumerate()
-        .map(|(i, line)| {
-            Qid::from_str(line).with_context(|| {
-                let line_num = i + 1;
-                format!("on line {line_num}: {line:?}")
-            })
-        })
-        .filter_map(|r| match r {
-            Ok(qid) => Some(qid),
+pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
+    for (i, line) in r.lines().enumerate() {
+        let line = line?;
+        match Qid::from_str(&line) {
+            Ok(qid) => collection.extend(Some(qid)),
             Err(e) => {
-                warn!("Could not parse QID: {:#}", e);
-                None
+                let line_num = i + 1;
+                warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
             }
-        })
-        .collect())
+        }
+    }
+    Ok(())
 }
 
 /// Read article titles from a file of urls on each line.
-pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
-    let contents = fs::read_to_string(path.as_ref())?;
-    Ok(contents
-        .lines()
-        .enumerate()
-        .map(|(i, line)| {
-            Title::from_osm_tag(line).with_context(|| {
-                let line_num = i + 1;
-                format!("on line {line_num}: {line:?}")
-            })
-        })
-        .filter_map(|r| match r {
-            Ok(qid) => Some(qid),
+pub fn parse_wikipedia_file(
+    r: impl BufRead,
+    collection: &mut impl Extend<Title>,
+) -> io::Result<()> {
+    for (i, line) in r.lines().enumerate() {
+        let line = line?;
+        match Title::from_osm_tag(&line) {
+            Ok(title) => collection.extend(Some(title)),
             Err(e) => {
-                warn!("Could not parse wikipedia title: {:#}", e);
-                None
+                let line_num = i + 1;
+                warn!(
+                    "Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
+                    e
+                );
             }
-        })
-        .collect())
+        }
+    }
+    Ok(())
 }
diff --git a/src/main.rs b/src/main.rs
index e1339c4..5ac7301 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -120,7 +120,8 @@ fn main() -> anyhow::Result<()> {
         let mut titles = HashSet::new();
         let mut errors = Vec::new();
         info!("Reading osm tag file");
-        om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
+        let file = File::open(osm_tags)?;
+        om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
         info!("Found {} errors in tag file", errors.len());
 
         let mut writer = csv::WriterBuilder::new()
diff --git a/src/tag_file.rs b/src/tag_file.rs
index 93d6415..0111f0e 100644
--- a/src/tag_file.rs
+++ b/src/tag_file.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
+use std::{error::Error, fmt::Display, io::Read, str::FromStr};
 
 use anyhow::{anyhow, bail};
 
@@ -9,19 +9,15 @@ use crate::{
 /// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
 pub fn parse_osm_tag_file(
-    path: impl AsRef<OsStr>,
-    qids: &mut HashSet<Qid>,
-    titles: &mut HashSet<Title>,
-    mut line_errors: Option<&mut Vec<ParseLineError>>,
+    r: impl Read,
+    qids: &mut impl Extend<Qid>,
+    titles: &mut impl Extend<Title>,
+    line_errors: &mut impl Extend<ParseLineError>,
 ) -> anyhow::Result<()> {
-    let path = path.as_ref();
-    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
+    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
 
     let mut push_error = |e: ParseLineError| {
-        debug!("Tag parse error: {e}");
-        if let Some(ref mut errs) = line_errors {
-            errs.push(e);
-        }
+        line_errors.extend(Some(e));
     };
 
     let mut qid_col = None;
@@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
         if !qid.is_empty() {
             match Qid::from_str(qid) {
                 Ok(qid) => {
-                    qids.insert(qid);
+                    qids.extend(Some(qid));
                 }
                 Err(e) => {
                     let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@ pub fn parse_osm_tag_file(
         if !title.is_empty() {
             match Title::from_osm_tag(title) {
                 Ok(title) => {
-                    titles.insert(title);
+                    titles.extend(Some(title));
                 }
                 Err(e) => {
                     let (osm_id, osm_type, osm_version) = parse_metadata();