Initial parsing and processing

The HTML processing should perform both of the main steps handled by the original
`descriptions_downloader.py` script:
- remove specific sections, e.g. "References"
- remove elements with no non-whitespace text

Determining how similar the output is to the original script's will require more testing.

A separate binary target is included for standalone HTML processing.
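
As an illustration, this is the intended effect on a toy fragment (a hedged sketch of the library call, not code included in this commit):

// Sketch only: assumes om-wikiparser is available as a library dependency.
use om_wikiparser::html::simplify;

fn main() {
    let raw = r#"<p>Kept text.</p>
        <div>   </div>
        <h2>References</h2>
        <ul><li>a citation</li></ul>"#;
    let cleaned = simplify(raw);
    // The "References" header and everything up to the next header of the same
    // level is stripped, and the whitespace-only <div> is dropped.
    assert!(cleaned.contains("Kept text."));
    assert!(!cleaned.contains("citation"));
    assert!(!cleaned.contains("<div>"));
}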

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
Evan Lloyd New-Schmidt <evan@new-schmidt.com>, 2023-06-01 10:14:46 -04:00 (committed by Evan Lloyd New-Schmidt)
parent aba31775fa
commit d55d3cc7e0
8 changed files with 1479 additions and 34 deletions

Cargo.lock (generated, 1088 changed lines): diff suppressed because it is too large.

Cargo.toml
@@ -4,10 +4,20 @@ version = "0.0.0"
license = "AGPL-3.0-only"
edition = "2021"
repository = "https://github.com/organicmaps/wikiparser/"
default-run = "om-wikiparser"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4.18"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
url = "2.3.1"
urlencoding = "2.1.2"
[profile.release]
debug = true
overflow-checks = true

src/bin/simplify_html.rs (new file, 18 lines)

@@ -0,0 +1,18 @@
//! Apply html article simplification to stdin, and write it to stdout.
//!
//! Usage:
//! simplify_html < article.html > simplified.html
use std::io::{stdin, stdout, Read, Write};

use om_wikiparser::html::simplify;

fn main() -> anyhow::Result<()> {
    let mut input = String::new();
    stdin().read_to_string(&mut input)?;
    let output = simplify(&input);
    stdout().write_all(output.as_bytes())?;
    Ok(())
}

src/html.rs (new file, 68 lines)

@@ -0,0 +1,68 @@
use scraper::{ElementRef, Html, Selector};

pub fn simplify(html: &str) -> String {
    // TODO: handle multiple languages
    let bad_sections = [
        "External links",
        "Sources",
        "See also",
        "Bibliography",
        "Further reading",
        "References",
    ];

    let mut document = Html::parse_document(html);

    // TODO: evaluate this only once
    let headers = Selector::parse("h1, h2, h3, h4, h5, h6, h7").unwrap();

    let mut to_remove = Vec::new();

    // remove sections
    for header in document.select(&headers) {
        // TODO: should this join all text nodes?
        let Some(title) = header.text().next() else {
            continue
        };
        if bad_sections.contains(&title) {
            to_remove.push(header.id());
            let header_level = header.value().name();
            // strip trailing nodes
            for sibling in header.next_siblings() {
                if let Some(element) = sibling.value().as_element() {
                    if element.name() == header_level {
                        // TODO: should this check for a higher level?
                        break;
                    }
                }
                to_remove.push(sibling.id());
            }
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    // remove elements with no text that isn't whitespace
    for element in document
        .root_element()
        .descendants()
        .filter_map(ElementRef::wrap)
    {
        if element.text().all(|t| t.trim().is_empty()) {
            to_remove.push(element.id());
        }
    }

    for id in to_remove.drain(..) {
        if let Some(mut node) = document.tree.get_mut(id) {
            node.detach();
        }
    }

    document.html()
}
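
A hypothetical boundary test (not part of this commit) for the section stripping above: removal stops at the next header of the same level, so unrelated sections survive.

#[cfg(test)]
mod tests {
    use super::simplify;

    #[test]
    fn section_removal_stops_at_next_same_level_header() {
        let html = "<h2>References</h2><ul><li>a citation</li></ul><h2>Other</h2><p>kept</p>";
        let out = simplify(html);
        // The "References" header and the list after it are removed.
        assert!(!out.contains("citation"));
        // The following same-level section is untouched.
        assert!(out.contains("Other"));
        assert!(out.contains("kept"));
    }
}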

src/lib.rs (new file, 2 lines)

@@ -0,0 +1,2 @@
pub mod html;
pub mod wm;

src/main.rs

@@ -1,40 +1,65 @@
// Usage:
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO | cargo run --release > /dev/null
// # prep outputs from map generator
// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
// # feed gzipped tarfile
// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
// | cargo run --release -- \
// --wikidata-ids /tmp/wikidata_ids.txt \
// --wikipedia-urls /tmp/wikipedia_urls.txt \
// output_dir
use std::{
fs::File,
io::{stdin, BufRead, BufReader, Write},
path::PathBuf,
};
use serde::Deserialize;
use std::io::{self, stdin, BufRead, BufReader, Write};
use anyhow::bail;
use clap::Parser;
#[macro_use]
extern crate log;
#[derive(Deserialize)]
struct Page {
// TODO: check if CoW has a performance impact
name: String,
date_modified: String,
#[serde(default)]
url: String,
main_entity: Option<Wikidata>,
// TODO: see what impact parsing/unescaping/allocating this has
article_body: ArticleBody,
#[serde(default)]
redirects: Vec<Redirect>,
}
use om_wikiparser::{
html::simplify,
wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
};
#[derive(Deserialize)]
struct Wikidata {
identifier: String,
}
#[derive(Deserialize)]
struct ArticleBody {
html: String,
}
#[derive(Deserialize)]
struct Redirect {
url: String,
name: String,
#[derive(Parser)]
struct Args {
output_dir: PathBuf,
#[arg(long)]
wikidata_ids: Option<PathBuf>,
#[arg(long)]
wikipedia_urls: Option<PathBuf>,
}
fn main() -> anyhow::Result<()> {
env_logger::Builder::new()
.filter_level(log::LevelFilter::Info)
.parse_default_env()
.try_init()?;
let args = Args::parse();
info!("Loading urls");
let wikipedia_titles = args
.wikipedia_urls
.map(parse_wikipedia_file)
.transpose()?
.unwrap_or_default();
info!("Loading ids");
let wikidata_ids = args
.wikidata_ids
.map(parse_wikidata_file)
.transpose()?
.unwrap_or_default();
if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
}
info!("Processing dump");
let dump = BufReader::new(stdin());
// TODO: compare different deserialization methods
@@ -45,10 +70,33 @@ fn main() -> anyhow::Result<()> {
.and_then(|s| serde_json::from_str::<Page>(&s).map_err(anyhow::Error::new))
});
let mut stdout = io::stdout();
for page in stream {
let page = page?;
writeln!(stdout, "{}", page.name)?;
if !(is_wikidata_match(&wikidata_ids, &page).is_some()
|| is_wikipedia_match(&wikipedia_titles, &page).is_some())
{
continue;
}
let Some(qid) = page.main_entity.map(|e| e.identifier) else {
warn!("Page in list but without wikidata qid: {:?}", page.name);
continue;
};
let filename = args.output_dir.join(qid).with_extension("html");
debug!("{:?}: {:?}", page.name, filename);
if filename.exists() {
debug!("Exists, skipping");
continue;
}
let html = simplify(&page.article_body.html);
let mut file = File::create(filename)?;
file.write_all(html.as_bytes())?;
}
Ok(())

src/wm/mod.rs (new file, 177 lines)

@@ -0,0 +1,177 @@
//! Wikimedia types
use std::{
    collections::HashSet,
    ffi::OsStr,
    fs::{self},
    num::ParseIntError,
    str::FromStr,
};

use anyhow::{anyhow, bail, Context};
use url::Url;

mod page;
pub use page::Page;

/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikidataQid::from_str(line).with_context(|| {
                let line_num = i + 1;
                format!("bad QID value on line {line_num}: {line:?}")
            })
        })
        .collect()
}

/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
    path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    let contents = fs::read_to_string(path.as_ref())?;
    contents
        .lines()
        .enumerate()
        .map(|(i, line)| {
            WikipediaTitleNorm::from_url(line).with_context(|| {
                let line_num = i + 1;
                format!("bad wikipedia url on line {line_num}: {line:?}")
            })
        })
        .collect()
}

pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
    let Some(wikidata) = &page.main_entity else { return None; };
    let wikidata_id = &wikidata.identifier;
    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
        Ok(qid) => qid,
        Err(e) => {
            eprintln!("Could not parse QID: {:?}: {}", wikidata_id, e);
            return None;
        }
    };
    ids.get(&wikidata_id).map(|_| wikidata_id)
}

pub fn is_wikipedia_match(
    titles: &HashSet<WikipediaTitleNorm>,
    page: &Page,
) -> Option<WikipediaTitleNorm> {
    // TODO: handle multiple languages
    let title = WikipediaTitleNorm::from_title(&page.name, "en");

    if titles.get(&title).is_some() {
        return Some(title);
    }

    for redirect in &page.redirects {
        let title = WikipediaTitleNorm::from_title(&redirect.name, "en");
        if titles.get(&title).is_some() {
            return Some(title);
        }
    }

    None
}

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str("12345").unwrap();
/// assert_eq!(with_q, without_q);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);

impl FromStr for WikidataQid {
    type Err = ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.strip_prefix('Q').unwrap_or(s);
        u32::from_str(s).map(WikidataQid)
    }
}

/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title/").unwrap();
/// let title = WikipediaTitleNorm::from_title("Article Title", "en");
/// assert_eq!(url, title);
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
    lang: String,
    name: String,
}

impl WikipediaTitleNorm {
    fn normalize_title(title: &str) -> String {
        // TODO: compare with generator url creation
        title.replace(' ', "_")
    }

    // https://en.wikipedia.org/wiki/Article_Title
    pub fn from_url(url: &str) -> anyhow::Result<Self> {
        let url = Url::parse(url)?;

        let (subdomain, host) = url
            .host_str()
            .ok_or(anyhow!("Expected host"))?
            .split_once('.')
            .ok_or(anyhow!("Expected subdomain"))?;
        if host != "wikipedia.org" {
            bail!("Expected wikipedia.org for domain")
        }
        let lang = subdomain;

        let mut paths = url.path_segments().ok_or(anyhow!("Expected path"))?;

        let root = paths
            .next()
            .ok_or(anyhow!("Expected first segment in path"))?;
        if root != "wiki" {
            bail!("Expected 'wiki' in path")
        }

        let title = paths
            .next()
            .ok_or(anyhow!("Expected second segment in path"))?;
        let title = urlencoding::decode(title)?;

        Ok(Self::from_title(&title, lang))
    }

    // en:Article Title
    fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
        let (lang, title) = tag.split_once(':').ok_or(anyhow!("Expected ':'"))?;
        Ok(Self::from_title(title, lang))
    }

    pub fn from_title(title: &str, lang: &str) -> Self {
        let name = Self::normalize_title(title);
        let lang = lang.to_owned();
        Self { name, lang }
    }
}
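
A hypothetical caller-side sketch (not in this commit) of how the URL set and the matcher fit together; the helper names are invented, and the example URL reuses the one from the doc comment above.

use std::collections::HashSet;

use om_wikiparser::wm::{is_wikipedia_match, Page, WikipediaTitleNorm};

/// Build the set of wanted titles from a handful of article urls.
fn build_titles() -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
    ["https://en.wikipedia.org/wiki/Spatial_database"]
        .iter()
        .map(|url| WikipediaTitleNorm::from_url(url))
        .collect()
}

/// True if the page's title or one of its redirects is in the wanted set.
fn wanted(titles: &HashSet<WikipediaTitleNorm>, page: &Page) -> bool {
    is_wikipedia_match(titles, page).is_some()
}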

src/wm/page.rs (new file, 36 lines)

@@ -0,0 +1,36 @@
use serde::Deserialize;

/// Deserialized Wikimedia Enterprise API Article
///
/// For all available fields, see https://enterprise.wikimedia.com/docs/data-dictionary/
#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Page {
    // TODO: check if CoW has a performance impact
    pub name: String,
    pub date_modified: String,
    #[serde(default)]
    pub url: String,
    pub main_entity: Option<Wikidata>,
    // TODO: see what impact parsing/unescaping/allocating this has
    pub article_body: ArticleBody,
    #[serde(default)]
    pub redirects: Vec<Redirect>,
}

#[derive(Deserialize)]
pub struct Wikidata {
    pub identifier: String,
}

#[derive(Deserialize)]
pub struct ArticleBody {
    pub html: String,
}

#[allow(dead_code)] // TODO: reevaluate fields
#[derive(Deserialize)]
pub struct Redirect {
    pub url: String,
    pub name: String,
}
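
A hypothetical round-trip sketch (field values invented, not part of this commit) showing the shape of one dump line that this struct accepts:

use om_wikiparser::wm::Page;

fn main() -> anyhow::Result<()> {
    // One JSON object per line in the Enterprise dump; values here are made up.
    let line = r#"{
        "name": "Spatial database",
        "date_modified": "2023-04-01T00:00:00Z",
        "url": "https://en.wikipedia.org/wiki/Spatial_database",
        "main_entity": { "identifier": "Q1234" },
        "article_body": { "html": "<html>...</html>" },
        "redirects": []
    }"#;
    let page: Page = serde_json::from_str(line)?;
    assert_eq!(page.main_entity.map(|e| e.identifier).as_deref(), Some("Q1234"));
    Ok(())
}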