From 292eeac081b6380bc41f37fa7833f77218d0d9a4 Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Date: Thu, 24 Aug 2023 15:42:19 -0400
Subject: [PATCH] Add command to write tag errors to file

- Write a TSV file with the line number, error, and input text.
- Include OSM object id if available in tag file.
- Update run script to write file once before extracting.

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
---
 README.md       | 18 ++++++++++-----
 run.sh          |  3 +++
 src/main.rs     | 59 +++++++++++++++++++++++++++++++++++++++++++++----
 src/wm/mod.rs   | 31 ++++++++++++++++++--------
 src/wm/title.rs | 24 +++++++++++++-------
 5 files changed, 109 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index 666597d..1184142 100644
--- a/README.md
+++ b/README.md
@@ -65,25 +65,30 @@ Run the program with the `--help` flag to see all supported arguments.
 
 ```
 $ cargo run --release -- --help
-Extract articles from Wikipedia Enterprise HTML dumps
+A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
 
 Usage: om-wikiparser <COMMAND>
 
 Commands:
-  get-articles  Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
   get-tags      Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump
-  simplify      Apply the same html article simplification used when extracting articles to stdin, and write it to stdout
+  check-tags    Attempt to parse extracted OSM tags and write errors to stdout in TSV format
+  get-articles  Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps
+  simplify      Apply html simplification to a single article
   help          Print this message or the help of the given subcommand(s)
 
 Options:
-  -h, --help     Print help (see more with '--help')
-  -V, --version  Print version
+  -h, --help
+          Print help (see a summary with '-h')
+
+  -V, --version
+          Print version
 ```
 
 Each command has its own additional help:
 
 ```
 $ cargo run -- get-articles --help
+
 Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
 
 Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
@@ -100,6 +105,9 @@ Options:
 
           Use this to save the QIDs of articles you know the url of, but not the QID. The same path can later be passed to the `--wikidata-qids` option to extract them from another language's dump. Writes are atomicly appended to the file, so the same path may be used by multiple concurrent instances.
 
+      --no-simplify
+          Don't process extracted HTML; write the original text to disk
+
   -h, --help
           Print help (see a summary with '-h')
 
diff --git a/run.sh b/run.sh
index 6b56306..910f9ee 100755
--- a/run.sh
+++ b/run.sh
@@ -100,6 +100,9 @@ cd "$BUILD_DIR"
 log "Extracting tags from '$OSM_FILE'"
 "$wikiparser" get-tags "$OSM_FILE" > osm_tags.tsv
 
+log "Writing tag parse errors to $BUILD_DIR/osm_tags_errors.tsv"
+"$wikiparser" check-tags osm_tags.tsv > osm_tags_errors.tsv
+
 # Enable backtraces in errors and panics.
 # NOTE: Backtraces are still printed for panics that are caught higher in the stack.
 # export RUST_BACKTRACE=1
diff --git a/src/main.rs b/src/main.rs
index ea6d1fc..9804a83 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
 use std::{
+    collections::HashSet,
     env,
     fs::File,
     io::{stdin, stdout, BufReader, Read, Write},
@@ -17,7 +18,7 @@ extern crate log;
 mod get_articles;
 mod get_tags;
 
-/// Extract articles from Wikipedia Enterprise HTML dumps.
+/// A set of tools to extract articles from Wikipedia Enterprise HTML dumps selected by OpenStreetMap tags.
 #[derive(Parser)]
 #[command(author, version, about, long_about, version = crate::version())]
 struct Args {
@@ -27,11 +28,10 @@ struct Args {
 
 #[derive(Subcommand)]
 enum Cmd {
-    GetArticles(get_articles::Args),
-
     /// Extract wikidata/wikipedia tags from an OpenStreetMap PBF dump.
     ///
     /// Writes to stdout the extracted tags in a TSV format similar to `osmconvert --csv`.
+    /// Unlike `osmconvert`, this **does not** truncate long tag values or create invalid UTF-8.
     GetTags {
         /// The `.osm.pbf` file to use.
         pbf_file: PathBuf,
@@ -46,8 +46,23 @@ enum Cmd {
         threads: Option<isize>,
     },
 
-    /// Apply the same html article simplification used when extracting articles to stdin, and write it to stdout.
+    /// Attempt to parse extracted OSM tags and write errors to stdout in TSV format.
+    CheckTags {
+        /// Path to a TSV file that contains at least one of the `wikidata` or `wikipedia` columns.
+        ///
+        /// This can be generated with the `get-tags` command or `osmconvert --csv-headline --csv 'wikidata wikipedia'`.
+        #[arg(value_name = "FILE.tsv")]
+        osm_tags: PathBuf,
+    },
+
+    /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
     ///
+    /// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
+    GetArticles(get_articles::Args),
+
+    /// Apply html simplification to a single article.
+    ///
+    /// Reads from stdin and writes the simplified version to stdout.
     /// This is meant for testing and debugging.
     Simplify {
         /// The language to use when processing the article (defaults to `en`).
@@ -97,6 +112,42 @@ fn main() -> anyhow::Result<()> {
             let pbf_file = File::open(pbf_file).map(BufReader::new)?;
             get_tags::run(pbf_file)
         }
+        Cmd::CheckTags { osm_tags } => {
+            let mut qids = HashSet::new();
+            let mut titles = HashSet::new();
+            let mut errors = Vec::new();
+            info!("Reading osm tag file");
+            om_wikiparser::wm::parse_osm_tag_file(
+                osm_tags,
+                &mut qids,
+                &mut titles,
+                Some(&mut errors),
+            )?;
+            info!("Found {} errors in tag file", errors.len());
+
+            let mut writer = csv::WriterBuilder::new()
+                .delimiter(b'\t')
+                .from_writer(stdout().lock());
+            writer.write_record(["line", "kind", "osm_id", "error", "value"])?;
+            for error in errors {
+                use om_wikiparser::wm::ParseErrorKind::*;
+                let kind = error.kind.to_string();
+                let id = error
+                    .osm_id
+                    .as_ref()
+                    .map(ToString::to_string)
+                    .unwrap_or_default();
+                let e: anyhow::Error = match error.kind {
+                    Title(e) => e.into(),
+                    Qid(e) => e.into(),
+                    Tsv(e) => e.into(),
+                };
+                let msg = e.to_string();
+                writer.write_record([&error.line.to_string(), &kind, &id, &msg, &error.text])?;
+            }
+
+            Ok(())
+        }
         Cmd::Simplify { lang } => {
             let mut input = String::new();
             stdin().read_to_string(&mut input)?;
diff --git a/src/wm/mod.rs b/src/wm/mod.rs
index 530b41a..d8c3d89 100644
--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs
@@ -72,10 +72,12 @@ pub fn parse_osm_tag_file(
 
     let mut qid_col = None;
     let mut title_col = None;
+    let mut osm_id_col = None;
     for (column, title) in rdr.headers()?.iter().enumerate() {
         match title {
             "wikidata" => qid_col = Some(column),
             "wikipedia" => title_col = Some(column),
+            "@id" => osm_id_col = Some(column),
             _ => (),
         }
     }
@@ -97,12 +99,15 @@ pub fn parse_osm_tag_file(
                 push_error(ParseLineError {
                     text: String::new(),
                     line: rdr.position().line(),
+                    osm_id: None,
                     kind: e.into(),
                 });
                 continue;
             }
         }
 
+        let osm_id = osm_id_col.and_then(|i| row[i].parse().ok());
+
         let qid = &row[qid_col].trim();
         if !qid.is_empty() {
             match Qid::from_str(qid) {
@@ -112,6 +117,7 @@ pub fn parse_osm_tag_file(
                 Err(e) => push_error(ParseLineError {
                     text: qid.to_string(),
                     line: rdr.position().line(),
+                    osm_id,
                     kind: e.into(),
                 }),
             }
@@ -126,6 +132,7 @@ pub fn parse_osm_tag_file(
                 Err(e) => push_error(ParseLineError {
                     text: title.to_string(),
                     line: rdr.position().line(),
+                    osm_id,
                     kind: e.into(),
                 }),
             }
@@ -137,25 +144,31 @@ pub fn parse_osm_tag_file(
 
 #[derive(Debug, thiserror::Error)]
 pub enum ParseErrorKind {
-    #[error("bad title")]
+    #[error("title")]
     Title(#[from] ParseTitleError),
-    #[error("bad QID")]
+    #[error("QID")]
     Qid(#[from] ParseQidError),
-    #[error("bad TSV line")]
+    #[error("TSV line")]
     Tsv(#[from] csv::Error),
 }
 
 #[derive(Debug)]
 pub struct ParseLineError {
-    text: String,
-    line: u64,
-    kind: ParseErrorKind,
+    pub text: String,
+    pub line: u64,
+    pub osm_id: Option<usize>,
+    pub kind: ParseErrorKind,
 }
 
 impl Display for ParseLineError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // write source chain to ensure they are logged
-        write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
+        write!(f, "on line {}", self.line)?;
+        if let Some(osm_id) = self.osm_id {
+            write!(f, " ({osm_id})")?;
+        }
+        write!(f, ": {} {:?}", self.kind, self.text)?;
+
+        // Write the source error chain to ensure it is logged.
         let mut source = self.kind.source();
         while let Some(e) = source {
             write!(f, ": {}", e)?;
@@ -167,7 +180,7 @@ impl Display for ParseLineError {
 
 impl Error for ParseLineError {
     fn source(&self) -> Option<&(dyn Error + 'static)> {
-        // return nothing b/c Display prints source chain
+        // Return nothing because Display prints source chain.
         None
     }
 }
diff --git a/src/wm/title.rs b/src/wm/title.rs
index 5d0879a..eac49f3 100644
--- a/src/wm/title.rs
+++ b/src/wm/title.rs
@@ -48,7 +48,12 @@ impl Title {
 
     // https://en.wikipedia.org/wiki/Article_Title/More_Title
     pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
-        let url = Url::parse(url.trim())?;
+        let url = url.trim();
+        if url.is_empty() {
+            return Err(ParseTitleError::Empty);
+        }
+
+        let url = Url::parse(url)?;
 
         let (subdomain, host) = url
             .host_str()
@@ -79,10 +84,11 @@ impl Title {
 
     // en:Article Title
     pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
-        let (lang, title) = tag
-            .trim()
-            .split_once(':')
-            .ok_or(ParseTitleError::MissingColon)?;
+        let tag = tag.trim();
+        if tag.is_empty() {
+            return Err(ParseTitleError::Empty);
+        }
+        let (lang, title) = tag.split_once(':').ok_or(ParseTitleError::MissingColon)?;
 
         let lang = lang.trim_start();
         let title = title.trim_start();
@@ -125,9 +131,11 @@ impl Title {
 
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 pub enum ParseTitleError {
-    #[error("title cannot be empty or whitespace")]
+    #[error("value is empty or whitespace")]
+    Empty,
+    #[error("title is empty or whitespace")]
     NoTitle,
-    #[error("lang cannot be empty or whitespace")]
+    #[error("lang is empty or whitespace")]
     NoLang,
     #[error("no ':' separating lang and title")]
     MissingColon,
@@ -141,7 +149,7 @@ pub enum ParseTitleError {
     NoHost,
     #[error("no subdomain in url")]
     NoSubdomain,
-    #[error("url base domain is wikipedia.org")]
+    #[error("url base domain is not wikipedia.org")]
     BadDomain,
     #[error("url base path is not /wiki/")]
     BadPath,