Make parse functions collection-agnostic
This makes it possible to use a Set in one place and a Vec in another, to log or count items without allocating a collection for all of them, and to ignore errors with no overhead. The alternative is converting them to custom iterators, which is more work than I want to do right now. Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
parent
d723452ec5
commit
e61f12d014
5 changed files with 97 additions and 65 deletions
38
src/extend.rs
Normal file
38
src/extend.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
//! Utilities for working with [Extend].
|
||||
use std::iter::Extend;
|
||||
|
||||
/// Calls `f` for each `Item`.
///
/// ```
/// # use om_wikiparser::extend;
/// let mut count = 0;
///
/// extend::from_fn(|_| count += 1).extend(std::iter::zip(
///     [1, 2, 3, 4],
///     ['a', 'b', 'c']));
/// assert_eq!(count, 3);
/// ```
pub fn from_fn<Item, F: FnMut(Item)>(f: F) -> FromFn<F> {
    FromFn(f)
}

/// An [Extend] adapter that forwards every extended `Item` to the wrapped
/// closure instead of storing it. Construct it with [from_fn].
pub struct FromFn<F>(F);

impl<Item, F: FnMut(Item)> Extend<Item> for FromFn<F> {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // `&mut F` implements `FnMut` when `F` does, so the closure can be
        // passed to `for_each` without being moved out of `self`.
        iter.into_iter().for_each(&mut self.0);
    }
}
|
||||
|
||||
/// Iterates but drops each `Item`.
pub fn sink() -> Sink {
    Sink(())
}

/// An [Extend] implementation that discards everything fed to it.
/// Construct it with [sink].
pub struct Sink(());

impl<Item> Extend<Item> for Sink {
    fn extend<T: IntoIterator<Item = Item>>(&mut self, iter: T) {
        // Drive the iterator to completion, dropping every item as it arrives.
        iter.into_iter().for_each(drop);
    }
}
|
|
@ -1,7 +1,8 @@
|
|||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashSet,
|
||||
fs::{self, File},
|
||||
io::{stdin, stdout, BufRead, Write},
|
||||
io::{stdin, stdout, BufRead, BufReader, Write},
|
||||
os::unix,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
@ -9,6 +10,7 @@ use std::{
|
|||
use anyhow::{anyhow, bail, Context};
|
||||
|
||||
use om_wikiparser::{
|
||||
extend,
|
||||
html::{self, HtmlError},
|
||||
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
|
||||
wm::{Page, Title},
|
||||
|
@ -67,34 +69,34 @@ pub struct Args {
|
|||
}
|
||||
|
||||
pub fn run(args: Args) -> anyhow::Result<()> {
|
||||
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
|
||||
let mut wikipedia_titles = HashSet::new();
|
||||
if let Some(path) = args.wikipedia_urls {
|
||||
info!("Loading article urls from {path:?}");
|
||||
parse_wikipedia_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
let file = BufReader::new(File::open(path)?);
|
||||
parse_wikipedia_file(file, &mut wikipedia_titles)?
|
||||
}
|
||||
|
||||
let mut wikidata_qids = if let Some(path) = args.wikidata_qids {
|
||||
let mut wikidata_qids = HashSet::new();
|
||||
if let Some(path) = args.wikidata_qids {
|
||||
info!("Loading wikidata QIDs from {path:?}");
|
||||
parse_wikidata_file(path)?
|
||||
} else {
|
||||
Default::default()
|
||||
let file = BufReader::new(File::open(path)?);
|
||||
parse_wikidata_file(file, &mut wikidata_qids)?
|
||||
};
|
||||
|
||||
if let Some(ref path) = args.osm_tags {
|
||||
info!("Loading wikipedia/wikidata osm tags from {path:?}");
|
||||
let file = File::open(path)?;
|
||||
|
||||
let original_items = wikidata_qids.len() + wikipedia_titles.len();
|
||||
let mut line_errors = Vec::new();
|
||||
let mut error_count = 0;
|
||||
parse_osm_tag_file(
|
||||
path,
|
||||
file,
|
||||
&mut wikidata_qids,
|
||||
&mut wikipedia_titles,
|
||||
Some(&mut line_errors),
|
||||
&mut extend::from_fn(|_| error_count += 1),
|
||||
)?;
|
||||
|
||||
if !line_errors.is_empty() {
|
||||
let error_count = line_errors.len();
|
||||
if error_count != 0 {
|
||||
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
|
||||
let percentage = 100.0 * error_count as f64 / new_items as f64;
|
||||
warn!("{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",);
|
||||
|
|
67
src/lib.rs
67
src/lib.rs
|
@ -1,57 +1,52 @@
|
|||
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
|
||||
use std::{
|
||||
io::{self, BufRead},
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
use anyhow::Context;
|
||||
|
||||
pub mod html;
|
||||
pub mod osm;
|
||||
mod tag_file;
|
||||
pub use tag_file::*;
|
||||
pub mod extend;
|
||||
pub mod wm;
|
||||
|
||||
use wm::{Qid, Title};
|
||||
|
||||
/// Read from a file of urls on each line.
|
||||
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Qid::from_str(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
pub fn parse_wikidata_file(r: impl BufRead, collection: &mut impl Extend<Qid>) -> io::Result<()> {
|
||||
for (i, line) in r.lines().enumerate() {
|
||||
let line = line?;
|
||||
match Qid::from_str(&line) {
|
||||
Ok(qid) => collection.extend(Some(qid)),
|
||||
Err(e) => {
|
||||
warn!("Could not parse QID: {:#}", e);
|
||||
None
|
||||
let line_num = i + 1;
|
||||
warn!("Could not parse QID: on line {line_num}: {line:?}: {:#}", e);
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read article titles from a file of urls on each line.
|
||||
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
|
||||
let contents = fs::read_to_string(path.as_ref())?;
|
||||
Ok(contents
|
||||
.lines()
|
||||
.enumerate()
|
||||
.map(|(i, line)| {
|
||||
Title::from_osm_tag(line).with_context(|| {
|
||||
let line_num = i + 1;
|
||||
format!("on line {line_num}: {line:?}")
|
||||
})
|
||||
})
|
||||
.filter_map(|r| match r {
|
||||
Ok(qid) => Some(qid),
|
||||
pub fn parse_wikipedia_file(
|
||||
r: impl BufRead,
|
||||
collection: &mut impl Extend<Title>,
|
||||
) -> io::Result<()> {
|
||||
for (i, line) in r.lines().enumerate() {
|
||||
let line = line?;
|
||||
match Title::from_osm_tag(&line) {
|
||||
Ok(title) => collection.extend(Some(title)),
|
||||
Err(e) => {
|
||||
warn!("Could not parse wikipedia title: {:#}", e);
|
||||
None
|
||||
let line_num = i + 1;
|
||||
warn!(
|
||||
"Could not parse wikipedia title: on line {line_num}: {line:?}: {:#}",
|
||||
e
|
||||
);
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -120,7 +120,8 @@ fn main() -> anyhow::Result<()> {
|
|||
let mut titles = HashSet::new();
|
||||
let mut errors = Vec::new();
|
||||
info!("Reading osm tag file");
|
||||
om_wikiparser::parse_osm_tag_file(osm_tags, &mut qids, &mut titles, Some(&mut errors))?;
|
||||
let file = File::open(osm_tags)?;
|
||||
om_wikiparser::parse_osm_tag_file(file, &mut qids, &mut titles, &mut errors)?;
|
||||
info!("Found {} errors in tag file", errors.len());
|
||||
|
||||
let mut writer = csv::WriterBuilder::new()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, str::FromStr};
|
||||
use std::{error::Error, fmt::Display, io::Read, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, bail};
|
||||
|
||||
|
@ -9,19 +9,15 @@ use crate::{
|
|||
|
||||
/// Read a TSV file of OSM tags, using wikipedia/wikidata tags.
|
||||
pub fn parse_osm_tag_file(
|
||||
path: impl AsRef<OsStr>,
|
||||
qids: &mut HashSet<Qid>,
|
||||
titles: &mut HashSet<Title>,
|
||||
mut line_errors: Option<&mut Vec<ParseLineError>>,
|
||||
r: impl Read,
|
||||
qids: &mut impl Extend<Qid>,
|
||||
titles: &mut impl Extend<Title>,
|
||||
line_errors: &mut impl Extend<ParseLineError>,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = path.as_ref();
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
|
||||
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(r);
|
||||
|
||||
let mut push_error = |e: ParseLineError| {
|
||||
debug!("Tag parse error: {e}");
|
||||
if let Some(ref mut errs) = line_errors {
|
||||
errs.push(e);
|
||||
}
|
||||
line_errors.extend(Some(e));
|
||||
};
|
||||
|
||||
let mut qid_col = None;
|
||||
|
@ -84,7 +80,7 @@ pub fn parse_osm_tag_file(
|
|||
if !qid.is_empty() {
|
||||
match Qid::from_str(qid) {
|
||||
Ok(qid) => {
|
||||
qids.insert(qid);
|
||||
qids.extend(Some(qid));
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
|
@ -104,7 +100,7 @@ pub fn parse_osm_tag_file(
|
|||
if !title.is_empty() {
|
||||
match Title::from_osm_tag(title) {
|
||||
Ok(title) => {
|
||||
titles.insert(title);
|
||||
titles.extend(Some(title));
|
||||
}
|
||||
Err(e) => {
|
||||
let (osm_id, osm_type, osm_version) = parse_metadata();
|
||||
|
|
Loading…
Add table
Reference in a new issue