Add new option to parse osm tag file
Parse wikipedia and wikidata tags from a TSV file of OSM tags, compatible with the "--csv" output of `osmconvert`.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
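
For context, the input is a tab-separated table whose header row names the columns; the parser added in this commit looks up the `wikidata` and `wikipedia` columns by name, so column order and extra columns do not matter. A hypothetical two-line example (tab-separated, values chosen purely for illustration):

wikidata	wikipedia
Q64	de:Berlin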

parent 0fc43767aa
commit a2c113a885

4 changed files with 114 additions and 11 deletions

Cargo.lock (generated): 22 lines changed

@@ -222,6 +222,27 @@ dependencies = [
  "syn 1.0.109",
 ]

+[[package]]
+name = "csv"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "derive_more"
 version = "0.99.17"

@@ -522,6 +543,7 @@ version = "0.0.0"
 dependencies = [
  "anyhow",
  "clap",
+ "csv",
  "ego-tree",
  "env_logger",
  "log",

Cargo.toml: 1 line changed

@@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
+csv = "1.2.2"
 ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
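
The new `csv` dependency provides the tab-delimited reader that the parser added below is built on. As a rough standalone sketch of the crate's API (not project code; `path` is whatever TSV file you point it at):

use std::error::Error;

// Minimal sketch: read a tab-separated file with a header row using the csv crate,
// with the same delimiter that parse_osm_tag_file configures below.
fn dump_tsv(path: &str) -> Result<(), Box<dyn Error>> {
    let mut rdr = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .from_path(path)?;
    println!("columns: {:?}", rdr.headers()?);
    for record in rdr.records() {
        // Each record is a csv::StringRecord indexed by column position.
        println!("{:?}", record?);
    }
    Ok(())
}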

src/main.rs: 32 lines changed

@@ -12,7 +12,7 @@ extern crate log;

 use om_wikiparser::{
     html::simplify,
-    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
+    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };

 /// Get the version returned by `git describe`, e.g.:

@@ -37,6 +37,12 @@ struct Args {
     /// Directory to write the extracted articles to.
     output_dir: PathBuf,

+    /// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
+    ///
+    /// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
+    #[arg(long, help_heading = "FILTERS")]
+    osm_tags: Option<PathBuf>,
+
     /// Path to file that contains a Wikidata QID to extract on each line
     /// (e.g. `Q12345`).
     #[arg(long, help_heading = "FILTERS")]
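
For readers less familiar with clap's derive API: a field declared this way becomes an optional `--osm-tags` option taking a path, listed under the FILTERS help heading, and it stays `None` when the flag is omitted. A stripped-down sketch (not the project's full Args struct):

use std::path::PathBuf;

use clap::Parser;

#[derive(Parser)]
struct Args {
    /// Path to a TSV file with `wikidata`/`wikipedia` columns.
    #[arg(long, help_heading = "FILTERS")]
    osm_tags: Option<PathBuf>,
}

fn main() {
    let args = Args::parse();
    // `--osm-tags` is optional; leaving it out simply skips the OSM tag filter.
    println!("{:?}", args.osm_tags);
}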

@@ -178,35 +184,39 @@ fn main() -> anyhow::Result<()> {
     let args = Args::parse();

-    if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
+    if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() && args.osm_tags.is_none() {
         let mut cmd = Args::command();
         cmd.error(
             clap::error::ErrorKind::MissingRequiredArgument,
-            "one or both of --wikidata-ids and --wikipedia-urls is required",
+            "at least one --osm-tags --wikidata-ids --wikipedia-urls is required",
         )
         .exit()
     }

     info!("{} {}", Args::command().get_name(), version());

-    let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
+    let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
         info!("Loading article urls from {path:?}");
-        let urls = parse_wikipedia_file(path)?;
-        debug!("Parsed {} unique article urls", urls.len());
-        urls
+        parse_wikipedia_file(path)?
     } else {
         Default::default()
     };

-    let wikidata_ids = if let Some(path) = args.wikidata_ids {
+    let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
         info!("Loading wikidata ids from {path:?}");
-        let ids = parse_wikidata_file(path)?;
-        debug!("Parsed {} unique wikidata ids", ids.len());
-        ids
+        parse_wikidata_file(path)?
     } else {
         Default::default()
     };

+    if let Some(path) = args.osm_tags {
+        info!("Loading wikipedia/wikidata osm tags from {path:?}");
+        parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
+    }
+
+    debug!("Parsed {} unique article urls", wikipedia_titles.len());
+    debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
+
     // NOTE: For atomic writes to the same file across threads/processes:
     //       - The file needs to be opened in APPEND mode (`.append(true)`).
     //       - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).

om_wikiparser::wm module: 70 lines changed

@@ -57,6 +57,76 @@ pub fn parse_wikipedia_file(
         .collect())
 }

+pub fn parse_osm_tag_file(
+    path: impl AsRef<OsStr>,
+    qids: &mut HashSet<WikidataQid>,
+    titles: &mut HashSet<WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
+
+    let mut qid_col = None;
+    let mut title_col = None;
+    for (column, title) in rdr.headers()?.iter().enumerate() {
+        match title {
+            "wikidata" => qid_col = Some(column),
+            "wikipedia" => title_col = Some(column),
+            _ => (),
+        }
+    }
+
+    let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
+    let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;
+
+    let mut row = csv::StringRecord::new();
+    loop {
+        match rdr.read_record(&mut row) {
+            Ok(true) => {}
+            // finished
+            Ok(false) => break,
+            // attempt to recover from parsing errors
+            Err(e) => {
+                error!("Error parsing tsv file: {}", e);
+                continue;
+            }
+        }
+
+        let qid = &row[qid_col].trim();
+        if !qid.is_empty() {
+            match WikidataQid::from_str(qid) {
+                Ok(qid) => {
+                    qids.insert(qid);
+                }
+                Err(e) => warn!(
+                    "Cannot parse qid {:?} on line {} in {:?}: {}",
+                    qid,
+                    rdr.position().line(),
+                    path,
+                    e
+                ),
+            }
+        }
+
+        let title = &row[title_col].trim();
+        if !title.is_empty() {
+            match WikipediaTitleNorm::_from_osm_tag(title) {
+                Ok(title) => {
+                    titles.insert(title);
+                }
+                Err(e) => warn!(
+                    "Cannot parse title {:?} on line {} in {:?}: {}",
+                    title,
+                    rdr.position().line(),
+                    path,
+                    e
+                ),
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Wikidata QID/Q Number
 ///
 /// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
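
To show how the new function slots in, here is a minimal hypothetical caller. It assumes `WikidataQid` is exported from `om_wikiparser::wm` alongside the items imported in `main.rs` above; `osm_tags.tsv` is a placeholder path:

use std::collections::HashSet;

use om_wikiparser::wm::{parse_osm_tag_file, WikidataQid, WikipediaTitleNorm};

fn main() -> anyhow::Result<()> {
    let mut qids: HashSet<WikidataQid> = HashSet::new();
    let mut titles: HashSet<WikipediaTitleNorm> = HashSet::new();

    // IDs and titles already collected from --wikidata-ids or --wikipedia-urls
    // could be inserted first; parse_osm_tag_file only adds to the sets.
    parse_osm_tag_file("osm_tags.tsv", &mut qids, &mut titles)?;

    println!("{} QIDs, {} article titles", qids.len(), titles.len());
    Ok(())
}

This mirrors the call in main.rs, where both sets are pre-populated from the other two filter flags before the OSM tag file is merged in.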