Make parse functions collection-agnostic

This makes it possible to use a Set in one place and a Vec in another,
to log or count items without allocating a collection for all of them,
and to ignore errors with no overhead.

The alternative is converting them to custom iterators, which is more work
than I want to do right now.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2024-03-15 13:30:00 -04:00 committed by Evan Lloyd New-Schmidt
parent d723452ec5
commit e61f12d014
5 changed files with 97 additions and 65 deletions

38
src/extend.rs Normal file
View file

@@ -0,0 +1,38 @@
//! Utilities for working with [Extend].
use std::iter::Extend;
/// Builds an [Extend] adapter that invokes `f` once per extended `Item`.
///
/// ```
/// # use om_wikiparser::extend;
/// let mut count = 0;
///
/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
///     [1, 2, 3, 4],
///     ['a', 'b', 'c']));
/// assert_eq!(count, 3);
/// ```
pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
    FromFn(f)
}

/// Adapter returned by [from_fn]; wraps the per-item callback.
pub struct FromFn<F>(F);

impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // Hand every item straight to the wrapped callback; `&mut F: FnMut`
        // lets `for_each` drive it without moving it out of `self`.
        iter.into_iter().for_each(&mut self.0);
    }
}
/// Builds an [Extend] adapter that consumes and discards every `Item`.
pub fn sink() -> Sink {
    Sink(())
}

/// Adapter returned by [sink]; accepts items of any type and drops them.
pub struct Sink(());

impl<Item> Extend<Item> for Sink {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // Drain the iterator fully (so any of its side effects still run),
        // dropping each item.
        iter.into_iter().for_each(drop);
    }
}

View file

@@ -1,7 +1,8 @@
use std::{
borrow::Cow,
collections::HashSet,
fs::{self, File},
io::{stdin, stdout, BufRead, Write},
io::{stdin, stdout, BufRead, BufReader, Write},
os::unix,
path::{Path, PathBuf},
};
@@ -9,6 +10,7 @@ use std::{
use anyhow::{anyhow, bail, Context};
use om_wikiparser::{
extend,
html::{self, HtmlError},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
@@ -67,34 +69,34 @@ pub struct Args {
}
pub fn run(args: Args) -> anyhow::Result<()> {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = HashSet::new();
if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
parse_wikipedia_file(path)?
} else {
Default::default()
};
let file = BufReader::new(File::open(path)?);
parse_wikipedia_file(file, &mut wikipedia_titles)?
}
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
let mut wikidata_qids = HashSet::new();
if let Some(path) = args.wikidata_qids {
info!("Loading wikidata QIDs from {path:?}");
parse_wikidata_file(path)?
} else {
Default::default()
let file = BufReader::new(File::open(path)?);
parse_wikidata_file(file, &mut wikidata_qids)?
};
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
let file = File::open(path)?;
let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
let mut error_count = 0;
parse_osm_tag_file(
path,
file,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
&mut extend::from_fn(|_| error_count += 1),
)?;
if !line_errors.is_empty() {
let error_count = line_errors.len();
if error_count != 0 {
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let percentage = 100.0 * error_count as f64 / new_items as f64;
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);

View file

@@ -1,57 +1,52 @@
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{
io::{self, BufRead},
str::FromStr,
};
#[macro_use]
extern crate log;
use anyhow::Context;
pub mod html;
pub mod osm;
mod tag_file;
pub use tag_file::*;
pub mod extend;
pub mod wm;
use wm::{Qid, Title};
/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Qid::from_str(&line) {
Ok(qid) => collection.extend(Some(qid)),
Err(e) => {
warn!("Could not parse QID: {:#}", e);
None
let line_num = i + 1;
warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
}
})
.collect())
}
}
Ok(())
}
/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_osm_tag(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
})
.filter_map(|r| match r {
Ok(qid) => Some(qid),
pub fn parse_wikipedia_file(
r: impl BufRead,
collection: &mut impl Extend<Title>,
) -> io::Result<()> {
for (i, line) in r.lines().enumerate() {
let line = line?;
match Title::from_osm_tag(&line) {
Ok(title) => collection.extend(Some(title)),
Err(e) => {
warn!("Could not parse wikipedia title: {:#}", e);
None
let line_num = i + 1;
warn!(
"Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
e
);
}
})
.collect())
}
}
Ok(())
}

View file

@@ -120,7 +120,8 @@ fn main() -> anyhow::Result<()> {
let mut titles = HashSet::new();
let mut errors = Vec::new();
info!("Reading osm tag file");
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
let file = File::open(osm_tags)?;
om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
info!("Found {} errors in tag file", errors.len());
let mut writer = csv::WriterBuilder::new()

View file

@@ -1,4 +1,4 @@
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
use std::{error::Error, fmt::Display, io::Read, str::FromStr};
use anyhow::{anyhow, bail};
@@ -9,19 +9,15 @@ use crate::{
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
r: impl Read,
qids: &mut impl Extend<Qid>,
titles: &mut impl Extend<Title>,
line_errors: &mut impl Extend<ParseLineError>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
line_errors.extend(Some(e));
};
let mut qid_col = None;
@@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
if !qid.is_empty() {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
qids.extend(Some(qid));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();
@@ -104,7 +100,7 @@ fn parse_osm_tag_file(
if !title.is_empty() {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
titles.extend(Some(title));
}
Err(e) => {
let (osm_id, osm_type, osm_version) = parse_metadata();