diff --git a/README.md b/README.md
index 9c95dd0..4c8a954 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,45 @@
 
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
 
-## Usage
+## Configuring
 
 [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
-It defines article sections that are not important for users and should be removed.
+It defines article sections that are not important for users and should be removed from the extracted HTML.
+
+## Usage
+
+First, install [the Rust language tools](https://www.rust-lang.org/).
+
+For best performance, use `--release` when building or running.
+
+You can run the program from within this directory using `cargo run --release --`.
+
+Alternatively, build it with `cargo build --release`, which places the binary in `./target/release/om-wikiparser`.
+
+Run the program with the `--help` flag to see all supported arguments.
+
+It takes as inputs:
+- A Wikimedia Enterprise JSON dump, extracted and piped to `stdin`.
+- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
+- A file of Wikipedia article URLs to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as the CLI flag `--wikipedia-urls`.
+- A directory to write the extracted articles to, passed as a CLI argument.
+
+As an example of usage with the map generator:
+- Assume this program is installed to `$PATH` as `om-wikiparser`.
+- Download [the dumps in the desired languages](https://dumps.wikimedia.org/other/enterprise_html/runs/) (use the files with the format `${LANG}wiki-NS0-${DATE}-ENTERPRISE-HTML.json.tar.gz`).
+  Set `DUMP_DOWNLOAD_DIR` to the directory they are downloaded to.
+- Run the following from within the `intermediate_data` subdirectory of the maps build directory:
+
+```shell
+# Transform intermediate files from the generator.
+cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
+tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Begin extraction.
+for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
+do
+  tar xzOf $dump | om-wikiparser \
+    --wikidata-ids wikidata_ids.txt \
+    --wikipedia-urls wikipedia_urls.txt \
+    descriptions/
+done
+```
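For reference, the two list files follow the line formats described above and are parsed with the `WikidataQid` and `WikipediaTitleNorm` types added in `src/wm/mod.rs` further down. The snippet below is a minimal, hedged sketch of that interpretation (it is not part of this change); the QID and URL values are illustrative only, and it assumes both types remain exported from `om_wikiparser::wm`.

```rust
use std::str::FromStr;

use om_wikiparser::wm::{WikidataQid, WikipediaTitleNorm};

fn main() -> anyhow::Result<()> {
    // `--wikidata-ids` file: one QID per line, e.g. "Q12345".
    let qid = WikidataQid::from_str("Q12345").expect("QID in the form `Q12345`");

    // `--wikipedia-urls` file: one article URL per line.
    let title = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Spatial_database")?;

    // Both kinds of values are collected into `HashSet`s and matched against
    // every page read from the dump on stdin.
    println!("looking for {qid} and {title:?}");
    Ok(())
}
```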
diff --git a/src/main.rs b/src/main.rs
index 6e1c393..30b41aa 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,27 +1,18 @@
-// Usage:
-// # prep outputs from map generator
-// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
-// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
-// # feed gzipped tarfile
-// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
-//   | cargo run --release -- \
-//     --wikidata-ids /tmp/wikidata_ids.txt \
-//     --wikipedia-urls /tmp/wikipedia_urls.txt \
-//     output_dir
 use std::{
-    fs::{create_dir, File},
+    fs::{self, File},
     io::{stdin, BufRead, Write},
+    os::unix,
     path::{Path, PathBuf},
 };
 
-use anyhow::bail;
+use anyhow::{anyhow, bail, Context};
 use clap::Parser;
 #[macro_use]
 extern crate log;
 
 use om_wikiparser::{
     html::simplify,
-    wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
+    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
 
@@ -33,33 +24,115 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
-fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
-    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-        // TODO: handle and still write
-        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
+fn create_article_dir(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<PathBuf> {
+    let base = base.as_ref();
+    let mut redirects = redirects.into_iter();
+
+    let main_dir = match page.wikidata() {
+        None => {
+            // Write to wikipedia title directory.
+            // Prefer first redirect, fall back to page title if none exist
+            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
+            redirects
+                .next()
+                .or_else(|| match page.title() {
+                    Ok(title) => Some(title),
+                    Err(e) => {
+                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
+                        None
+                    }
+                })
+                // hard fail when no titles can be parsed
+                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
+                .get_dir(base.to_owned())
+        }
+        Some(qid) => {
+            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
+            qid.get_dir(base.to_owned())
+        }
     };
 
-    let mut filename = dir.as_ref().to_owned();
-    filename.push(qid);
+    if main_dir.is_symlink() {
+        fs::remove_file(&main_dir)
+            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
+    }
+    fs::create_dir_all(&main_dir)
+        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
+
+    // Write symlinks to main directory.
+    // TODO: Only write redirects that we care about.
+    for title in redirects {
+        let wikipedia_dir = title.get_dir(base.to_owned());
+
+        // Build required directory.
+        //
+        // Possible states from previous run:
+        // - Does not exist (and is not a symlink)
+        // - Exists, is a directory
+        // - Exists, is a valid symlink to correct location
+        // - Exists, is a valid symlink to incorrect location
+        if wikipedia_dir.exists() {
+            if wikipedia_dir.is_symlink() {
+                // Only replace if not valid
+                if fs::read_link(&wikipedia_dir)? == main_dir {
+                    continue;
+                }
+                fs::remove_file(&wikipedia_dir)?;
+            } else {
+                fs::remove_dir_all(&wikipedia_dir)?;
+            }
+        } else {
+            // titles can contain `/`, so ensure necessary subdirs exist
+            let parent_dir = wikipedia_dir.parent().unwrap();
+            fs::create_dir_all(parent_dir)
+                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
+        }
+
+        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
+            format!(
+                "creating symlink from {:?} to {:?}",
+                wikipedia_dir, main_dir
+            )
+        })?;
+    }
+
+    Ok(main_dir)
+}
+
+/// Write selected article to disk.
+///
+/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
+/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
+/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
+fn write(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let article_dir = create_article_dir(base, page, redirects)?;
+
+    // Write html to determined file.
+    let mut filename = article_dir;
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");
 
     debug!("{:?}: {:?}", page.name, filename);
 
     if filename.exists() {
-        debug!("Exists, skipping");
-        return Ok(());
-    }
-
-    let subfolder = filename.parent().unwrap();
-    if !subfolder.exists() {
-        create_dir(subfolder)?;
+        debug!("Overwriting existing file");
     }
 
     let html = simplify(&page.article_body.html, &page.in_language.identifier);
 
-    let mut file = File::create(&filename)?;
-    file.write_all(html.as_bytes())?;
+    let mut file =
+        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
+    file.write_all(html.as_bytes())
+        .with_context(|| format!("writing html file {:?}", filename))?;
 
     Ok(())
 }
@@ -104,14 +177,28 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        if !(is_wikidata_match(&wikidata_ids, &page).is_some()
-            || is_wikipedia_match(&wikipedia_titles, &page).is_some())
-        {
+        let is_wikidata_match = page
+            .wikidata()
+            .map(|qid| wikidata_ids.contains(&qid))
+            .unwrap_or_default();
+
+        let matching_titles = page
+            .all_titles()
+            .filter_map(|r| {
+                r.map(Some).unwrap_or_else(|e| {
+                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                    None
+                })
+            })
+            .filter(|t| wikipedia_titles.contains(t))
+            .collect::<Vec<_>>();
+
+        if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
-        if let Err(e) = write(&args.output_dir, page) {
-            error!("Error writing article: {}", e);
+        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
+            error!("Error writing article {:?}: {:#}", page.name, e);
         }
     }
 
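To make the resulting on-disk layout easier to picture, here is a minimal sketch (not part of this change) of the directories produced by the `get_dir` helpers that `create_article_dir` relies on, using the `descriptions/` output directory from the README example. It assumes `WikidataQid` and `WikipediaTitleNorm` are reachable from `om_wikiparser::wm`; the QID and URL are again only examples.

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::{WikidataQid, WikipediaTitleNorm};

fn main() -> anyhow::Result<()> {
    let base = PathBuf::from("descriptions");

    // Articles with a QID get a main directory like `descriptions/wikidata/Q12345/`;
    // `write` then stores the simplified HTML as `<main dir>/<lang>.html`.
    let qid = WikidataQid::from_str("Q12345").expect("QID in the form `Q12345`");
    assert_eq!(
        qid.get_dir(base.clone()),
        PathBuf::from("descriptions/wikidata/Q12345")
    );

    // Each matching title and redirect becomes a symlink to that main directory,
    // placed at `descriptions/<lang>.wikipedia.org/wiki/<normalized title>`.
    let title = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Spatial_database")?;
    println!("symlink at: {}", title.get_dir(base).display());

    Ok(())
}
```

Because titles can contain `/` (see the `Breil/Brigels` doctest below), the wikipedia-side paths may nest further, which is why `create_article_dir` calls `fs::create_dir_all` on the symlink's parent directory.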
diff --git a/src/wm/mod.rs b/src/wm/mod.rs
index 6625691..2a76e6d 100644
--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs
@@ -1,5 +1,8 @@
 //! Wikimedia types
-use std::{collections::HashSet, ffi::OsStr, fs, num::ParseIntError, str::FromStr};
+use std::{
+    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
+    str::FromStr,
+};
 
 use anyhow::{anyhow, bail, Context};
 
@@ -40,53 +43,6 @@ pub fn parse_wikipedia_file(
     .collect()
 }
 
-pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
-    let Some(wikidata) = &page.main_entity else { return None;};
-    let wikidata_id = &wikidata.identifier;
-    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
-        Ok(qid) => qid,
-        Err(e) => {
-            warn!(
-                "Could not parse QID for {:?}: {:?}: {:#}",
-                page.name, wikidata_id, e
-            );
-            return None;
-        }
-    };
-
-    ids.get(&wikidata_id).map(|_| wikidata_id)
-}
-
-pub fn is_wikipedia_match(
-    titles: &HashSet<WikipediaTitleNorm>,
-    page: &Page,
-) -> Option<WikipediaTitleNorm> {
-    match WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier) {
-        Err(e) => warn!("Could not parse title for {:?}: {:#}", page.name, e),
-        Ok(title) => {
-            if titles.get(&title).is_some() {
-                return Some(title);
-            }
-        }
-    }
-
-    for redirect in &page.redirects {
-        match WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier) {
-            Err(e) => warn!(
-                "Could not parse redirect title for {:?}: {:?}: {:#}",
-                page.name, redirect.name, e
-            ),
-            Ok(title) => {
-                if titles.get(&title).is_some() {
-                    return Some(title);
-                }
-            }
-        }
-    }
-
-    None
-}
-
 /// Wikidata QID/Q Number
 ///
 /// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
@@ -118,6 +74,23 @@ impl FromStr for WikidataQid {
     }
 }
 
+impl Display for WikidataQid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl WikidataQid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+
+        path
+    }
+}
+
 /// Normalized wikipedia article title that can compare:
 /// - titles `Spatial Database`
 /// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
@@ -132,6 +105,11 @@ impl FromStr for WikidataQid {
 ///
 /// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
 /// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
 /// ```
 #[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct WikipediaTitleNorm {
@@ -145,7 +123,7 @@ impl WikipediaTitleNorm {
         title.trim().replace(' ', "_")
     }
 
-    // https://en.wikipedia.org/wiki/Article_Title
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
    pub fn from_url(url: &str) -> anyhow::Result<Self> {
         let url = Url::parse(url.trim())?;
 
@@ -159,21 +137,17 @@ impl WikipediaTitleNorm {
         }
         let lang = subdomain;
 
-        let mut paths = url
-            .path_segments()
-            .ok_or_else(|| anyhow!("Expected path"))?;
+        let path = url.path();
 
-        let root = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected first segment in path"))?;
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
 
         if root != "wiki" {
-            bail!("Expected 'wiki' in path")
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
         }
-
-        let title = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected second segment in path"))?;
         let title = urlencoding::decode(title)?;
 
         Self::from_title(&title, lang)
@@ -202,4 +176,14 @@ impl WikipediaTitleNorm {
         let lang = lang.to_owned();
         Ok(Self { name, lang })
     }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+
+        path
+    }
 }
diff --git a/src/wm/page.rs b/src/wm/page.rs
index d680be5..85b6647 100644
--- a/src/wm/page.rs
+++ b/src/wm/page.rs
@@ -1,5 +1,9 @@
+use std::{iter, str::FromStr};
+
 use serde::Deserialize;
 
+use super::{WikidataQid, WikipediaTitleNorm};
+
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
@@ -20,6 +24,31 @@ pub struct Page {
     pub redirects: Vec<Redirect>,
 }
 
+impl Page {
+    pub fn wikidata(&self) -> Option<WikidataQid> {
+        // TODO: return error
+        self.main_entity
+            .as_ref()
+            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+    }
+
+    /// Title of the article
+    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
+        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    }
+
+    /// All titles that lead to the article, the main title followed by any redirects.
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        iter::once(self.title()).chain(self.redirects())
+    }
+
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        self.redirects
+            .iter()
+            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+    }
+}
+
 #[derive(Deserialize)]
 pub struct Wikidata {
     pub identifier: String,
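As a compact view of how the new `Page` helpers compose, the sketch below mirrors the filtering added to `src/main.rs` above. It is a hedged illustration rather than part of the change, and it assumes `Page`, `WikidataQid`, and `WikipediaTitleNorm` are all exported from `om_wikiparser::wm`.

```rust
use std::collections::HashSet;

use om_wikiparser::wm::{Page, WikidataQid, WikipediaTitleNorm};

/// Returns true when a dump page should be extracted: either its QID was
/// requested, or one of its titles/redirects was requested.
fn page_matches(
    page: &Page,
    wikidata_ids: &HashSet<WikidataQid>,
    wikipedia_titles: &HashSet<WikipediaTitleNorm>,
) -> bool {
    let qid_match = page
        .wikidata()
        .map(|qid| wikidata_ids.contains(&qid))
        .unwrap_or_default();

    let title_match = page
        .all_titles()
        .filter_map(Result::ok)
        .any(|title| wikipedia_titles.contains(&title));

    qid_match || title_match
}
```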