Add new option to parse osm tag file

Parse wikipedia and wikidata tags from a tsv file of OSM tags,
compatible with the "--csv" output of `osmconvert`.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
This commit is contained in:
Evan Lloyd New-Schmidt 2023-08-02 18:55:53 -04:00
parent 0fc43767aa
commit a2c113a885
4 changed files with 114 additions and 11 deletions

22
Cargo.lock generated
View file

@ -222,6 +222,27 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "csv"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086"
dependencies = [
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -522,6 +543,7 @@ version = "0.0.0"
dependencies = [
"anyhow",
"clap",
"csv",
"ego-tree",
"env_logger",
"log",

View file

@ -10,6 +10,7 @@ default-run = "om-wikiparser"
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
csv = "1.2.2"
ego-tree = "0.6.2"
env_logger = "0.10.0"
log = "0.4.18"

View file

@ -12,7 +12,7 @@ extern crate log;
use om_wikiparser::{
html::simplify,
wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
};
/// Get the version returned by `git describe`, e.g.:
@ -37,6 +37,12 @@ struct Args {
/// Directory to write the extracted articles to.
output_dir: PathBuf,
/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
/// This can be generated with `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
#[arg(long, help_heading = "FILTERS")]
osm_tags: Option<PathBuf>,
/// Path to file that contains a Wikidata QID to extract on each line
/// (e.g. `Q12345`).
#[arg(long, help_heading = "FILTERS")]
@ -178,35 +184,39 @@ fn main() -> anyhow::Result<()> {
let args = Args::parse();
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() {
if args.wikidata_ids.is_none() && args.wikipedia_urls.is_none() && args.osm_tags.is_none() {
let mut cmd = Args::command();
cmd.error(
clap::error::ErrorKind::MissingRequiredArgument,
"one or both of --wikidata-ids and --wikipedia-urls is required",
"at least one of --osm-tags, --wikidata-ids, or --wikipedia-urls is required",
)
.exit()
}
info!("{} {}", Args::command().get_name(), version());
let wikipedia_titles = if let Some(path) = args.wikipedia_urls {
let mut wikipedia_titles = if let Some(path) = args.wikipedia_urls {
info!("Loading article urls from {path:?}");
let urls = parse_wikipedia_file(path)?;
debug!("Parsed {} unique article urls", urls.len());
urls
parse_wikipedia_file(path)?
} else {
Default::default()
};
let wikidata_ids = if let Some(path) = args.wikidata_ids {
let mut wikidata_ids = if let Some(path) = args.wikidata_ids {
info!("Loading wikidata ids from {path:?}");
let ids = parse_wikidata_file(path)?;
debug!("Parsed {} unique wikidata ids", ids.len());
ids
parse_wikidata_file(path)?
} else {
Default::default()
};
if let Some(path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
parse_osm_tag_file(path, &mut wikidata_ids, &mut wikipedia_titles)?;
}
debug!("Parsed {} unique article urls", wikipedia_titles.len());
debug!("Parsed {} unique wikidata ids", wikidata_ids.len());
// NOTE: For atomic writes to the same file across threads/processes:
// - The file needs to be opened in APPEND mode (`.append(true)`).
// - Each write needs to be a single syscall (for Rust, use `format!` for formatting before calling `write!`, or `write!` to a `String` first).

View file

@ -57,6 +57,76 @@ pub fn parse_wikipedia_file(
.collect())
}
/// Parse a TSV file of OSM tags, inserting any `wikidata` QIDs and
/// `wikipedia` titles it contains into the provided sets.
///
/// The file is expected to be in the format produced by
/// `osmconvert --csv-headline --csv 'wikidata wikipedia'`: a tab-delimited
/// header row that must contain both a `wikidata` and a `wikipedia` column,
/// followed by data rows.
///
/// Returns an error if the file cannot be opened/read or if either required
/// column is missing from the header. Malformed rows and unparseable field
/// values are logged and skipped so one bad row does not abort the whole parse.
pub fn parse_osm_tag_file(
    path: impl AsRef<OsStr>,
    qids: &mut HashSet<WikidataQid>,
    titles: &mut HashSet<WikipediaTitleNorm>,
) -> anyhow::Result<()> {
    let path = path.as_ref();
    let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;

    // Locate the two columns of interest from the header row; any other
    // columns are ignored.
    let mut qid_col = None;
    let mut title_col = None;
    for (column, title) in rdr.headers()?.iter().enumerate() {
        match title {
            "wikidata" => qid_col = Some(column),
            "wikipedia" => title_col = Some(column),
            _ => (),
        }
    }
    let qid_col = qid_col.ok_or_else(|| anyhow!("Cannot find 'wikidata' column"))?;
    let title_col = title_col.ok_or_else(|| anyhow!("Cannot find 'wikipedia' column"))?;

    // Reuse a single record buffer across all rows to avoid a per-row allocation.
    let mut row = csv::StringRecord::new();
    loop {
        match rdr.read_record(&mut row) {
            Ok(true) => {}
            // finished
            Ok(false) => break,
            // attempt to recover from parsing errors
            Err(e) => {
                error!("Error parsing tsv file: {}", e);
                continue;
            }
        }

        // Use checked access (`get`) rather than indexing so a row with fewer
        // fields than the header cannot panic; a missing field is treated as empty.
        let qid = row.get(qid_col).unwrap_or_default().trim();
        if !qid.is_empty() {
            match WikidataQid::from_str(qid) {
                Ok(qid) => {
                    qids.insert(qid);
                }
                Err(e) => warn!(
                    "Cannot parse qid {:?} on line {} in {:?}: {}",
                    qid,
                    rdr.position().line(),
                    path,
                    e
                ),
            }
        }

        let title = row.get(title_col).unwrap_or_default().trim();
        if !title.is_empty() {
            match WikipediaTitleNorm::_from_osm_tag(title) {
                Ok(title) => {
                    titles.insert(title);
                }
                Err(e) => warn!(
                    "Cannot parse title {:?} on line {} in {:?}: {}",
                    title,
                    rdr.position().line(),
                    path,
                    e
                ),
            }
        }
    }

    Ok(())
}
/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID