Add new option to parse osm tag file

Parse wikipedia and wikidata tags from a tsv file of OSM tags,
compatible with the "--csv" output of `osmconvert`.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-02 18:55:53 -04:00
parent 0fc43767aa
commit a2c113a885
4 changed files with 114 additions and 11 deletions

22
Cargo.lock generated
View file

@ -222,6 +222,27 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "csv"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
dependencies = [
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -522,6 +543,7 @@ version = "0.0.0"
dependencies = [
"anyhow",
"clap",
"csv",
"ego-tree",
"env_logger",
"log",

View file

@ -10,6 +10,7 @@ default-run = "om-wikiparser"
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
csv = "1.2.2"
ego-tree = "0.6.2"
env_logger = "0.10.0"
log = "0.4.18"

View file

@ -12,7 +12,7 @@ extern crate log;
use om_wikiparser::{
html::simplify,
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};
/// Get the version returned by `git describe`, e.g.:
@ -37,6 +37,12 @@ struct Args {
/// Directory to write the extracted articles to.
output_dir: PathBuf,
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
/// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
#[arg(long, help_heading = "FILTERS")]
osm_tags: Option<PathBuf>,
/// Path to file that contains a Wikidata QID to extract on each line
/// (e.g. `Q12345`).
#[arg(long, help_heading = "FILTERS")]
@ -178,35 +184,39 @@ fn main() -> anyhow::Result<()> {
let args = Args::parse();
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() && args.osm_tags.is_none() {
let mut cmd = Args::command();
cmd.error(
clap::error::ErrorKind::MissingRequiredArgument,
"one or both of --wikidata-ids and --wikipedia-urls is required",
"at least one of --osm-tags, --wikidata-ids, or --wikipedia-urls is required",
)
.exit()
}
info!("{} {}", Args::command().get_name(), version());
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
let urls = parse_wikipedia_file(path)?;
debug!("Parsed {} unique article urls", urls.len());
urls
parse_wikipedia_file(path)?
} else {
Default::default()
};
let wikidata_ids = if let Some(path) = args.wikidata_ids {
let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
info!("Loading wikidata ids from {path:?}");
let ids = parse_wikidata_file(path)?;
debug!("Parsed {} unique wikidata ids", ids.len());
ids
parse_wikidata_file(path)?
} else {
Default::default()
};
if let Some(path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
}
debug!("Parsed {} unique article urls", wikipedia_titles.len());
debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
// NOTE: For atomic writes to the same file across threads/processes:
// - The file needs to be opened in APPEND mode (`.append(true)`).
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).

View file

@ -57,6 +57,76 @@ pub fn parse_wikipedia_file(
.collect())
}
/// Parse a TSV file of OSM tags, inserting any `wikidata` QIDs and
/// `wikipedia` titles it contains into the provided sets.
///
/// The file is expected to be in the format produced by
/// `osmconvert --csv-headline --csv 'wikidata wikipedia'`: a tab-delimited
/// header row that must contain both a `wikidata` and a `wikipedia` column,
/// followed by data rows.
///
/// Returns an error if the file cannot be opened/read or if either required
/// column is missing from the header. Malformed rows and unparseable field
/// values are logged and skipped so one bad row does not abort the whole parse.
pub fn parse_osm_tag_file(
    path: impl AsRef<OsStr>,
    qids: &mut HashSet<WikidataQid>,
    titles: &mut HashSet<WikipediaTitleNorm>,
) -> anyhow::Result<()> {
    let path = path.as_ref();
    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;

    // Locate the two columns of interest from the header row; any other
    // columns are ignored.
    let mut qid_col = None;
    let mut title_col = None;
    for (column, title) in rdr.headers()?.iter().enumerate() {
        match title {
            "wikidata" => qid_col = Some(column),
            "wikipedia" => title_col = Some(column),
            _ => (),
        }
    }
    let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
    let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;

    // Reuse a single record buffer across all rows to avoid a per-row allocation.
    let mut row = csv::StringRecord::new();
    loop {
        match rdr.read_record(&mut row) {
            Ok(true) => {}
            // finished
            Ok(false) => break,
            // attempt to recover from parsing errors
            Err(e) => {
                error!("Error parsing tsv file: {}", e);
                continue;
            }
        }

        // Use checked access (`get`) rather than indexing so a row with fewer
        // fields than the header cannot panic; a missing field is treated as empty.
        let qid = row.get(qid_col).unwrap_or_default().trim();
        if !qid.is_empty() {
            match WikidataQid::from_str(qid) {
                Ok(qid) => {
                    qids.insert(qid);
                }
                Err(e) => warn!(
                    "Cannot parse qid {:?} on line {} in {:?}: {}",
                    qid,
                    rdr.position().line(),
                    path,
                    e
                ),
            }
        }

        let title = row.get(title_col).unwrap_or_default().trim();
        if !title.is_empty() {
            match WikipediaTitleNorm::_from_osm_tag(title) {
                Ok(title) => {
                    titles.insert(title);
                }
                Err(e) => warn!(
                    "Cannot parse title {:?} on line {} in {:?}: {}",
                    title,
                    rdr.position().line(),
                    path,
                    e
                ),
            }
        }
    }

    Ok(())
}
/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID