Write to generator-compatible folder structure (#6)

The map generator expects the folder structure created by the current
scraper so that it can add the article content to the mwm files.

- Article HTML is written to the wikidata directory.
- Directories are created for any matched titles and symlinked to the
  wikidata directory, as illustrated below.
- Articles without a QID are written to the article title directory.
- Article titles containing `/` are not escaped, so multiple
  subdirectories are possible.

The output folder hierarchy looks like this:

    .
    ├── de.wikipedia.org
    │  └── wiki
    │     ├── Coal_River_Springs_Territorial_Park
    │     │  ├── de.html
    │     │  └── ru.html
    │     ├── Ni'iinlii_Njik_(Fishing_Branch)_Territorial_Park
    │     │  ├── de.html
    │     │  └── en.html
    │    ...
    ├── en.wikipedia.org
    │  └── wiki
    │     ├── Arctic_National_Wildlife_Refuge
    │     │  ├── de.html
    │     │  ├── en.html
    │     │  ├── es.html
    │     │  ├── fr.html
    │     │  └── ru.html
    │     ├── Baltimore
    │     │  └── Washington_International_Airport
    │     │     ├── de.html
    │     │     ├── en.html
    │     │     ├── es.html
    │     │     ├── fr.html
    │     │     └── ru.html
    │    ...
    └── wikidata
       ├── Q59320
       │  ├── de.html
       │  ├── en.html
       │  ├── es.html
       │  ├── fr.html
       │  └── ru.html
       ├── Q120306
       │  ├── de.html
       │  ├── en.html
       │  ├── es.html
       │  ├── fr.html
       │  └── ru.html
      ...
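
Matched title directories are symlinks pointing at the corresponding
QID directory, for example (the title-to-QID pairing here is
hypothetical):

    en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge -> wikidata/Q59320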

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>

--- a/README.md
+++ b/README.md

@@ -2,7 +2,45 @@
 _Extracts articles from [Wikipedia database dumps](https://en.wikipedia.org/wiki/Wikipedia:Database_download) for embedding into the `mwm` map files created by [the Organic Maps generator](https://github.com/organicmaps/organicmaps/blob/master/tools/python/maps_generator/README.md)._
 
-## Usage
+## Configuring
 
 [`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
-It defines article sections that are not important for users and should be removed.
+It defines article sections that are not important for users and should be removed from the extracted HTML.
+
+## Usage
+
+First, install [the Rust language tools](https://www.rust-lang.org/).
+For best performance, use `--release` when building or running.
+You can run the program from within this directory using `cargo run --release --`.
+Alternatively, build it with `cargo build --release`, which places the binary in `./target/release/om-wikiparser`.
+Run the program with the `--help` flag to see all supported arguments.
+
+It takes as inputs:
+- A Wikipedia Enterprise JSON dump, extracted and piped to `stdin`.
+- A file of Wikidata QIDs to extract, one per line (e.g. `Q12345`), passed as the CLI flag `--wikidata-ids`.
+- A file of Wikipedia article URLs to extract, one per line (e.g. `https://$LANG.wikipedia.org/wiki/$ARTICLE_TITLE`), passed as the CLI flag `--wikipedia-urls`.
+- A directory to write the extracted articles to, as a CLI argument.
+
+As an example of usage with the map generator:
+- Assuming this program is installed to `$PATH` as `om-wikiparser`.
+- Download [the dumps in the desired languages](https://dumps.wikimedia.org/other/enterprise_html/runs/) (use the files with the format `${LANG}wiki-NS0-${DATE}-ENTERPRISE-HTML.json.tar.gz`) and set `DUMP_DOWNLOAD_DIR` to the location they are downloaded to.
+- Run the following from within the `intermediate_data` subdirectory of the maps build directory:
+```shell
+# Transform intermediate files from the generator.
+cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
+tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Begin extraction.
+for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
+do
+  tar xzOf $dump | om-wikiparser \
+    --wikidata-ids wikidata_ids.txt \
+    --wikipedia-urls wikipedia_urls.txt \
+    descriptions/
+done
+```
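
For reference, the two derived input files are plain lists with one value per line; the entries below are illustrative:

```
$ head -2 wikidata_ids.txt
Q59320
Q120306
$ head -2 wikipedia_urls.txt
https://en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge
https://de.wikipedia.org/wiki/Coal_River_Springs_Territorial_Park
```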

--- a/src/main.rs
+++ b/src/main.rs

@@ -1,27 +1,18 @@
-// Usage:
-// # prep outputs from map generator
-// cut -f 2 ~/Downloads/id_to_wikidata.csv > /tmp/wikidata_ids.txt
-// tail -n +2 ~/Downloads/wiki_urls.txt | cut -f 3 > /tmp/wikipedia_urls.txt
-// # feed gzipped tarfile
-// pv ~/Downloads/enwiki-NS0-20230401-ENTERPRISE-HTML.json.tar.gz | tar xzO \
-//   | cargo run --release -- \
-//     --wikidata-ids /tmp/wikidata_ids.txt \
-//     --wikipedia-urls /tmp/wikipedia_urls.txt \
-//     output_dir
 use std::{
-    fs::{create_dir, File},
+    fs::{self, File},
     io::{stdin, BufRead, Write},
+    os::unix,
     path::{Path, PathBuf},
 };
 
-use anyhow::bail;
+use anyhow::{anyhow, bail, Context};
 use clap::Parser;
 #[macro_use]
 extern crate log;
 
 use om_wikiparser::{
     html::simplify,
-    wm::{is_wikidata_match, is_wikipedia_match, parse_wikidata_file, parse_wikipedia_file, Page},
+    wm::{parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
 };
#[derive(Parser)]
@@ -33,33 +24,115 @@ struct Args {
     wikipedia_urls: Option<PathBuf>,
 }
 
-fn write(dir: impl AsRef<Path>, page: Page) -> anyhow::Result<()> {
-    let Some(qid) = page.main_entity.map(|e| e.identifier) else {
-        // TODO: handle and still write
-        bail!("Page in list but without wikidata qid: {:?} ({})", page.name, page.url);
+/// Determine the directory to write the article contents to, create it, and create any necessary symlinks to it.
+fn create_article_dir(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<PathBuf> {
+    let base = base.as_ref();
+    let mut redirects = redirects.into_iter();
+
+    let main_dir = match page.wikidata() {
+        None => {
+            // Write to wikipedia title directory.
+            // Prefer first redirect, fall back to page title if none exist
+            info!("Page without wikidata qid: {:?} ({})", page.name, page.url);
+            redirects
+                .next()
+                .or_else(|| match page.title() {
+                    Ok(title) => Some(title),
+                    Err(e) => {
+                        warn!("Unable to parse title for page {:?}: {:#}", page.name, e);
+                        None
+                    }
+                })
+                // hard fail when no titles can be parsed
+                .ok_or_else(|| anyhow!("No available titles for page {:?}", page.name))?
+                .get_dir(base.to_owned())
+        }
+        Some(qid) => {
+            // Otherwise use wikidata as main directory and symlink from wikipedia titles.
+            qid.get_dir(base.to_owned())
+        }
     };
 
-    let mut filename = dir.as_ref().to_owned();
-    filename.push(qid);
+    if main_dir.is_symlink() {
+        fs::remove_file(&main_dir)
+            .with_context(|| format!("removing old link for main directory {:?}", &main_dir))?;
+    }
+    fs::create_dir_all(&main_dir)
+        .with_context(|| format!("creating main directory {:?}", &main_dir))?;
+
+    // Write symlinks to main directory.
+    // TODO: Only write redirects that we care about.
+    for title in redirects {
+        let wikipedia_dir = title.get_dir(base.to_owned());
+
+        // Build required directory.
+        //
+        // Possible states from previous run:
+        // - Does not exist (and is not a symlink)
+        // - Exists, is a directory
+        // - Exists, is a valid symlink to correct location
+        // - Exists, is a valid symlink to incorrect location
+        if wikipedia_dir.exists() {
+            if wikipedia_dir.is_symlink() {
+                // Only replace if not valid
+                if fs::read_link(&wikipedia_dir)? == main_dir {
+                    continue;
+                }
+                fs::remove_file(&wikipedia_dir)?;
+            } else {
+                fs::remove_dir_all(&wikipedia_dir)?;
+            }
+        } else {
+            // titles can contain `/`, so ensure necessary subdirs exist
+            let parent_dir = wikipedia_dir.parent().unwrap();
+            fs::create_dir_all(parent_dir)
+                .with_context(|| format!("creating wikipedia directory {:?}", parent_dir))?;
+        }
+
+        unix::fs::symlink(&main_dir, &wikipedia_dir).with_context(|| {
+            format!(
+                "creating symlink from {:?} to {:?}",
+                wikipedia_dir, main_dir
+            )
+        })?;
+    }
+
+    Ok(main_dir)
+}
+
+/// Write selected article to disk.
+///
+/// - Write page contents to wikidata page (`wikidata.org/wiki/QXXX/lang.html`).
+/// - If the page has no wikidata qid, write contents to wikipedia location (`lang.wikipedia.org/wiki/article_title/lang.html`).
+/// - Create links from all wikipedia urls and redirects (`lang.wikipedia.org/wiki/a_redirect -> wikidata.org/wiki/QXXX`).
+fn write(
+    base: impl AsRef<Path>,
+    page: &Page,
+    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+) -> anyhow::Result<()> {
+    let article_dir = create_article_dir(base, page, redirects)?;
+
+    // Write html to determined file.
+    let mut filename = article_dir;
     filename.push(&page.in_language.identifier);
     filename.set_extension("html");
 
     debug!("{:?}: {:?}", page.name, filename);
 
     if filename.exists() {
-        debug!("Exists, skipping");
-        return Ok(());
-    }
-
-    let subfolder = filename.parent().unwrap();
-    if !subfolder.exists() {
-        create_dir(subfolder)?;
+        debug!("Overwriting existing file");
     }
 
     let html = simplify(&page.article_body.html, &page.in_language.identifier);
 
-    let mut file = File::create(&filename)?;
-    file.write_all(html.as_bytes())?;
+    let mut file =
+        File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
+    file.write_all(html.as_bytes())
+        .with_context(|| format!("writing html file {:?}", filename))?;
 
     Ok(())
 }
@@ -104,14 +177,28 @@ fn main() -> anyhow::Result<()> {
     for page in stream {
         let page = page?;
 
-        if !(is_wikidata_match(&wikidata_ids, &page).is_some()
-            || is_wikipedia_match(&wikipedia_titles, &page).is_some())
-        {
+        let is_wikidata_match = page
+            .wikidata()
+            .map(|qid| wikidata_ids.contains(&qid))
+            .unwrap_or_default();
+
+        let matching_titles = page
+            .all_titles()
+            .filter_map(|r| {
+                r.map(Some).unwrap_or_else(|e| {
+                    warn!("Could not parse title for {:?}: {:#}", &page.name, e);
+                    None
+                })
+            })
+            .filter(|t| wikipedia_titles.contains(t))
+            .collect::<Vec<_>>();
+
+        if !is_wikidata_match && matching_titles.is_empty() {
             continue;
         }
 
-        if let Err(e) = write(&args.output_dir, page) {
-            error!("Error writing article: {}", e);
+        if let Err(e) = write(&args.output_dir, &page, matching_titles) {
+            error!("Error writing article {:?}: {:#}", page.name, e);
         }
     }

--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs

@@ -1,5 +1,8 @@
 //! Wikimedia types
-use std::{collections::HashSet, ffi::OsStr, fs, num::ParseIntError, str::FromStr};
+use std::{
+    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
+    str::FromStr,
+};
 
 use anyhow::{anyhow, bail, Context};
@@ -40,53 +43,6 @@ pub fn parse_wikipedia_file(
         .collect()
 }
 
-pub fn is_wikidata_match(ids: &HashSet<WikidataQid>, page: &Page) -> Option<WikidataQid> {
-    let Some(wikidata) = &page.main_entity else { return None;};
-    let wikidata_id = &wikidata.identifier;
-    let wikidata_id = match WikidataQid::from_str(wikidata_id) {
-        Ok(qid) => qid,
-        Err(e) => {
-            warn!(
-                "Could not parse QID for {:?}: {:?}: {:#}",
-                page.name, wikidata_id, e
-            );
-            return None;
-        }
-    };
-
-    ids.get(&wikidata_id).map(|_| wikidata_id)
-}
-
-pub fn is_wikipedia_match(
-    titles: &HashSet<WikipediaTitleNorm>,
-    page: &Page,
-) -> Option<WikipediaTitleNorm> {
-    match WikipediaTitleNorm::from_title(&page.name, &page.in_language.identifier) {
-        Err(e) => warn!("Could not parse title for {:?}: {:#}", page.name, e),
-        Ok(title) => {
-            if titles.get(&title).is_some() {
-                return Some(title);
-            }
-        }
-    }
-
-    for redirect in &page.redirects {
-        match WikipediaTitleNorm::from_title(&redirect.name, &page.in_language.identifier) {
-            Err(e) => warn!(
-                "Could not parse redirect title for {:?}: {:?}: {:#}",
-                page.name, redirect.name, e
-            ),
-            Ok(title) => {
-                if titles.get(&title).is_some() {
-                    return Some(title);
-                }
-            }
-        }
-    }
-
-    None
-}
-
 /// Wikidata QID/Q Number
 ///
 /// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
@@ -118,6 +74,23 @@ impl FromStr for WikidataQid {
     }
 }
 
+impl Display for WikidataQid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl WikidataQid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+        path
+    }
+}
+
 /// Normalized wikipedia article title that can compare:
 /// - titles `Spatial Database`
 /// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
@@ -132,6 +105,11 @@ impl FromStr for WikidataQid {
 ///
 /// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
 /// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
 /// ```
 #[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct WikipediaTitleNorm {
@@ -145,7 +123,7 @@ impl WikipediaTitleNorm {
         title.trim().replace(' ', "_")
     }
 
-    // https://en.wikipedia.org/wiki/Article_Title
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
     pub fn from_url(url: &str) -> anyhow::Result<Self> {
         let url = Url::parse(url.trim())?;
@@ -159,21 +137,17 @@ impl WikipediaTitleNorm {
         }
         let lang = subdomain;
 
-        let mut paths = url
-            .path_segments()
-            .ok_or_else(|| anyhow!("Expected path"))?;
+        let path = url.path();
 
-        let root = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected first segment in path"))?;
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
 
         if root != "wiki" {
-            bail!("Expected 'wiki' in path")
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
         }
 
-        let title = paths
-            .next()
-            .ok_or_else(|| anyhow!("Expected second segment in path"))?;
         let title = urlencoding::decode(title)?;
 
         Self::from_title(&title, lang)
@@ -202,4 +176,14 @@ impl WikipediaTitleNorm {
         let lang = lang.to_owned();
 
         Ok(Self { name, lang })
     }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+        path
+    }
 }
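
Together, the two `get_dir` helpers produce the layout shown in the commit message. A minimal sketch of their behavior (values taken from the example tree above; assumes the crate's public `wm` module is in scope and that normalization leaves this title unchanged):

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::{WikidataQid, WikipediaTitleNorm};

fn main() {
    // QID directories are grouped under "wikidata/".
    let qid = WikidataQid::from_str("Q59320").unwrap();
    assert_eq!(
        qid.get_dir(PathBuf::from("out")),
        PathBuf::from("out/wikidata/Q59320")
    );

    // Title directories are grouped under "<lang>.wikipedia.org/wiki/".
    let title = WikipediaTitleNorm::from_url(
        "https://en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge",
    )
    .unwrap();
    assert_eq!(
        title.get_dir(PathBuf::from("out")),
        PathBuf::from("out/en.wikipedia.org/wiki/Arctic_National_Wildlife_Refuge")
    );
}
```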

--- a/src/wm/page.rs
+++ b/src/wm/page.rs

@@ -1,5 +1,9 @@
+use std::{iter, str::FromStr};
+
 use serde::Deserialize;
 
+use super::{WikidataQid, WikipediaTitleNorm};
+
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
 ///
@@ -20,6 +24,31 @@ pub struct Page {
     pub redirects: Vec<Redirect>,
 }
 
+impl Page {
+    pub fn wikidata(&self) -> Option<WikidataQid> {
+        // TODO: return error
+        self.main_entity
+            .as_ref()
+            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+    }
+
+    /// Title of the article
+    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
+        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    }
+
+    /// All titles that lead to the article, the main title followed by any redirects.
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        iter::once(self.title()).chain(self.redirects())
+    }
+
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+        self.redirects
+            .iter()
+            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+    }
+}
+
 #[derive(Deserialize)]
 pub struct Wikidata {
     pub identifier: String,
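
As a usage sketch, the new `Page` helpers make it easy to enumerate every directory that should resolve to an article (hypothetical snippet; assumes `page` is a `Page` deserialized from one dump line):

```rust
use std::path::PathBuf;

// `page` is assumed to be a deserialized `Page`; "descriptions" is the
// output directory from the README example.
let base = PathBuf::from("descriptions");
for title in page.all_titles() {
    match title {
        // The main title comes first, followed by any redirects.
        Ok(t) => println!("{}", t.get_dir(base.clone()).display()),
        Err(e) => eprintln!("could not parse title for {:?}: {:#}", page.name, e),
    }
}
```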